Commit 94872ce6 authored by: G guosheng

Update text.py and Transformer.

Parent 57365421
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import distutils.util
def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--train_data_prefix",
type=str,
help="file prefix for train data")
parser.add_argument("--eval_data_prefix",
type=str,
help="file prefix for eval data")
parser.add_argument("--test_data_prefix",
type=str,
help="file prefix for test data")
parser.add_argument("--vocab_prefix",
type=str,
help="file prefix for vocab")
parser.add_argument("--src_lang", type=str, help="source language suffix")
parser.add_argument("--tar_lang", type=str, help="target language suffix")
parser.add_argument("--attention",
type=eval,
default=False,
help="Whether use attention model")
parser.add_argument("--optimizer",
type=str,
default='adam',
help="optimizer to use, only supprt[sgd|adam]")
parser.add_argument("--learning_rate",
type=float,
default=0.001,
help="learning rate for optimizer")
parser.add_argument("--num_layers",
type=int,
default=1,
help="layers number of encoder and decoder")
parser.add_argument("--hidden_size",
type=int,
default=100,
help="hidden size of encoder and decoder")
parser.add_argument("--src_vocab_size", type=int, help="source vocab size")
parser.add_argument("--tar_vocab_size", type=int, help="target vocab size")
parser.add_argument("--batch_size",
type=int,
help="batch size of each step")
parser.add_argument("--max_epoch",
type=int,
default=12,
help="max epoch for the training")
parser.add_argument("--max_len",
type=int,
default=50,
help="max length for source and target sentence")
parser.add_argument("--dropout",
type=float,
default=0.0,
help="drop probability")
parser.add_argument("--init_scale",
type=float,
default=0.0,
help="init scale for parameter")
parser.add_argument("--max_grad_norm",
type=float,
default=5.0,
help="max grad norm for global norm clip")
parser.add_argument("--model_path",
type=str,
default='model',
help="model path for model to save")
parser.add_argument("--reload_model",
type=str,
help="reload model to inference")
parser.add_argument("--infer_file",
type=str,
help="file name for inference")
parser.add_argument("--infer_output_file",
type=str,
default='infer_output',
help="file name for inference output")
parser.add_argument("--beam_size",
type=int,
default=10,
help="file name for inference")
parser.add_argument('--use_gpu',
type=eval,
default=False,
help='Whether to use gpu [True|False]')
parser.add_argument('--eager_run',
type=eval,
default=False,
help='Whether to use dygraph')
parser.add_argument("--enable_ce",
action='store_true',
help="The flag indicating whether to run the task "
"for continuous evaluation.")
parser.add_argument("--profile",
action='store_true',
help="Whether enable the profile.")
# NOTE: profiler args, used for benchmark
parser.add_argument(
"--profiler_path",
type=str,
default='./seq2seq.profile',
help="the profiler output file path. (used for benchmark)")
args = parser.parse_args()
return args
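# Added usage sketch (illustrative, not part of the original file): all flags
# above either have defaults or may be omitted, so the parser can be exercised
# e.g. as
#   python args.py --attention True --use_gpu False --batch_size 128
# and then inspected with
#   args = parse_args()
#   print(args.optimizer, args.learning_rate, args.num_layers)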
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import argparse
import json
import yaml
import six
import logging
logging_only_message = "%(message)s"
logging_details = "%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s"
class JsonConfig(object):
"""
A high-level API for handling a JSON configuration file.
"""
def __init__(self, config_path):
self._config_dict = self._parse(config_path)
def _parse(self, config_path):
try:
with open(config_path) as json_file:
config_dict = json.load(json_file)
except:
raise IOError("Error in parsing bert model config file '%s'" %
config_path)
else:
return config_dict
def __getitem__(self, key):
return self._config_dict[key]
def print_config(self):
for arg, value in sorted(six.iteritems(self._config_dict)):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
class ArgumentGroup(object):
def __init__(self, parser, title, des):
self._group = parser.add_argument_group(title=title, description=des)
def add_arg(self, name, type, default, help, **kwargs):
type = str2bool if type == bool else type
self._group.add_argument(
"--" + name,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
class ArgConfig(object):
"""
A high-level api for handling argument configs.
"""
def __init__(self):
parser = argparse.ArgumentParser()
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5,
"Learning rate used to train with warmup.")
train_g.add_arg(
"lr_scheduler",
str,
"linear_warmup_decay",
"scheduler of learning rate.",
choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01,
"Weight decay rate for L2 regularizer.")
train_g.add_arg(
"warmup_proportion", float, 0.1,
"Proportion of training steps to perform linear learning rate warmup for."
)
train_g.add_arg("save_steps", int, 1000,
"The steps interval to save checkpoints.")
train_g.add_arg("use_fp16", bool, False,
"Whether to use fp16 mixed precision training.")
train_g.add_arg(
"loss_scaling", float, 1.0,
"Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled."
)
train_g.add_arg("pred_dir", str, None,
"Path to save the prediction results")
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10,
"The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True,
"If set, use GPU for training.")
run_type_g.add_arg(
"use_fast_executor", bool, False,
"If set, use fast parallel executor (in experiment).")
run_type_g.add_arg(
"num_iteration_per_drop_scope", int, 1,
"Ihe iteration intervals to clean up temporary variables.")
run_type_g.add_arg("do_train", bool, True,
"Whether to perform training.")
run_type_g.add_arg("do_predict", bool, True,
"Whether to perform prediction.")
custom_g = ArgumentGroup(parser, "customize", "customized options.")
self.custom_g = custom_g
self.parser = parser
def add_arg(self, name, dtype, default, descrip):
self.custom_g.add_arg(name, dtype, default, descrip)
def build_conf(self):
return self.parser.parse_args()
def str2bool(v):
# because argparse does not support parsing strings like "True"/"False"
# into Python booleans directly
return v.lower() in ("true", "t", "1")
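# Added note (illustrative): str2bool is case-insensitive on truthy strings,
# e.g. str2bool("True") -> True, str2bool("t") -> True, str2bool("0") -> False.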
def print_arguments(args, log=None):
if not log:
print('----------- Configuration Arguments -----------')
for arg, value in sorted(six.iteritems(vars(args))):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
else:
log.info('----------- Configuration Arguments -----------')
for arg, value in sorted(six.iteritems(vars(args))):
log.info('%s: %s' % (arg, value))
log.info('------------------------------------------------')
class PDConfig(object):
"""
A high-level API for managing configuration files in PaddlePaddle.
Can work jointly with command-line arguments, JSON files and YAML files.
"""
def __init__(self, json_file="", yaml_file="", fuse_args=True):
"""
Init function for PDConfig.
json_file: the path to the JSON configuration file.
yaml_file: the path to the YAML configuration file.
fuse_args: whether to fuse the JSON/YAML configs with argparse.
"""
assert isinstance(json_file, str)
assert isinstance(yaml_file, str)
if json_file != "" and yaml_file != "":
raise Warning(
"json_file and yaml_file can not co-exist for now. please only use one configure file type."
)
return
self.args = None
self.arg_config = {}
self.json_config = {}
self.yaml_config = {}
parser = argparse.ArgumentParser()
self.default_g = ArgumentGroup(parser, "default", "default options.")
self.yaml_g = ArgumentGroup(parser, "yaml", "options from yaml.")
self.json_g = ArgumentGroup(parser, "json", "options from json.")
self.com_g = ArgumentGroup(parser, "custom", "customized options.")
self.default_g.add_arg("do_train", bool, False,
"Whether to perform training.")
self.default_g.add_arg("do_predict", bool, False,
"Whether to perform predicting.")
self.default_g.add_arg("do_eval", bool, False,
"Whether to perform evaluating.")
self.default_g.add_arg("do_save_inference_model", bool, False,
"Whether to perform model saving for inference.")
# NOTE: args for profiler
self.default_g.add_arg("is_profiler", int, 0, "the switch of profiler tools. (used for benchmark)")
self.default_g.add_arg("profiler_path", str, './', "the profiler output file path. (used for benchmark)")
self.default_g.add_arg("max_iter", int, 0, "the max train batch num.(used for benchmark)")
self.parser = parser
if json_file != "":
self.load_json(json_file, fuse_args=fuse_args)
if yaml_file:
self.load_yaml(yaml_file, fuse_args=fuse_args)
def load_json(self, file_path, fuse_args=True):
if not os.path.exists(file_path):
raise Warning("the json file %s does not exist." % file_path)
return
with open(file_path, "r") as fin:
self.json_config = json.loads(fin.read())
fin.close()
if fuse_args:
for name in self.json_config:
if isinstance(self.json_config[name], list):
self.json_g.add_arg(
name,
type(self.json_config[name][0]),
self.json_config[name],
"This is from %s" % file_path,
nargs=len(self.json_config[name]))
continue
if not isinstance(self.json_config[name], int) \
and not isinstance(self.json_config[name], float) \
and not isinstance(self.json_config[name], str) \
and not isinstance(self.json_config[name], bool):
continue
self.json_g.add_arg(name,
type(self.json_config[name]),
self.json_config[name],
"This is from %s" % file_path)
def load_yaml(self, file_path, fuse_args=True):
if not os.path.exists(file_path):
raise Warning("the yaml file %s does not exist." % file_path)
return
with open(file_path, "r") as fin:
self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader)
fin.close()
if fuse_args:
for name in self.yaml_config:
if isinstance(self.yaml_config[name], list):
self.yaml_g.add_arg(
name,
type(self.yaml_config[name][0]),
self.yaml_config[name],
"This is from %s" % file_path,
nargs=len(self.yaml_config[name]))
continue
if not isinstance(self.yaml_config[name], int) \
and not isinstance(self.yaml_config[name], float) \
and not isinstance(self.yaml_config[name], str) \
and not isinstance(self.yaml_config[name], bool):
continue
self.yaml_g.add_arg(name,
type(self.yaml_config[name]),
self.yaml_config[name],
"This is from %s" % file_path)
def build(self):
self.args = self.parser.parse_args()
self.arg_config = vars(self.args)
def __add__(self, new_arg):
assert isinstance(new_arg, list) or isinstance(new_arg, tuple)
assert len(new_arg) >= 3
assert self.args is None
name = new_arg[0]
dtype = new_arg[1]
dvalue = new_arg[2]
desc = new_arg[3] if len(
new_arg) == 4 else "Description is not provided."
self.com_g.add_arg(name, dtype, dvalue, desc)
return self
def __getattr__(self, name):
if name in self.arg_config:
return self.arg_config[name]
if name in self.json_config:
return self.json_config[name]
if name in self.yaml_config:
return self.yaml_config[name]
raise Warning("The argument %s is not defined." % name)
def Print(self):
print("-" * 70)
for name in self.arg_config:
print("%s:\t\t\t\t%s" % (str(name), str(self.arg_config[name])))
for name in self.json_config:
if name not in self.arg_config:
print("%s:\t\t\t\t%s" %
(str(name), str(self.json_config[name])))
for name in self.yaml_config:
if name not in self.arg_config:
print("%s:\t\t\t\t%s" %
(str(name), str(self.yaml_config[name])))
print("-" * 70)
if __name__ == "__main__":
"""
pd_config = PDConfig(json_file = "./test/bert_config.json")
pd_config.build()
print(pd_config.do_train)
print(pd_config.hidden_size)
pd_config = PDConfig(yaml_file = "./test/bert_config.yaml")
pd_config.build()
print(pd_config.do_train)
print(pd_config.hidden_size)
"""
pd_config = PDConfig(yaml_file="./test/bert_config.yaml")
pd_config += ("my_age", int, 18, "I am forever 18.")
pd_config.build()
print(pd_config.do_train)
print(pd_config.hidden_size)
print(pd_config.my_age)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import six
import os
import tarfile
import itertools
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.fluid.io import BatchSampler, DataLoader, Dataset
def prepare_train_input(insts, src_pad_idx, trg_pad_idx):
"""
Put all the padded data needed for training into a list.
"""
src, src_length = pad_batch_data([inst[0] for inst in insts], src_pad_idx)
trg, trg_length = pad_batch_data([inst[1] for inst in insts], trg_pad_idx)
label, _ = pad_batch_data([inst[2] for inst in insts], trg_pad_idx)
return src, src_length, trg, trg_length, np.expand_dims(label, -1)
def pad_batch_data(insts, pad_idx):
"""
Pad the instances to the max sequence length in batch, and return the
padded data together with the original instance lengths.
"""
inst_length = np.array([len(inst) for inst in insts], dtype="int64")
max_len = np.max(inst_length)
inst_data = np.array(
[inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
return inst_data, inst_length
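# Added illustration (a minimal sketch, not in the original file): padding a
# toy batch of two id sequences to the in-batch max length with pad_idx=0,
#   data, lengths = pad_batch_data([[5, 6, 7], [8, 9]], pad_idx=0)
#   # data    -> [[5, 6, 7], [8, 9, 0]]
#   # lengths -> [3, 2]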
class SortType(object):
GLOBAL = 'global'
POOL = 'pool'
NONE = "none"
class Converter(object):
def __init__(self, vocab, beg, end, unk, delimiter, add_beg):
self._vocab = vocab
self._beg = beg
self._end = end
self._unk = unk
self._delimiter = delimiter
self._add_beg = add_beg
def __call__(self, sentence):
return ([self._beg] if self._add_beg else []) + [
self._vocab.get(w, self._unk)
for w in sentence.split(self._delimiter)
] + [self._end]
class ComposedConverter(object):
def __init__(self, converters):
self._converters = converters
def __call__(self, parallel_sentence):
return [
self._converters[i](parallel_sentence[i])
for i in range(len(self._converters))
]
class SentenceBatchCreator(object):
def __init__(self, batch_size):
self.batch = []
self._batch_size = batch_size
def append(self, info):
self.batch.append(info)
if len(self.batch) == self._batch_size:
tmp = self.batch
self.batch = []
return tmp
class TokenBatchCreator(object):
def __init__(self, batch_size):
self.batch = []
self.max_len = -1
self._batch_size = batch_size
def append(self, info):
cur_len = info.max_len
max_len = max(self.max_len, cur_len)
if max_len * (len(self.batch) + 1) > self._batch_size:
result = self.batch
self.batch = [info]
self.max_len = cur_len
return result
else:
self.max_len = max_len
self.batch.append(info)
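# Added illustration (hypothetical values): with a token budget of
# batch_size=10, TokenBatchCreator keeps appending samples while
# max_len * (current batch size + 1) stays within the budget and emits the
# pending batch once the next sample would overflow it,
#   creator = TokenBatchCreator(batch_size=10)
#   creator.append(SampleInfo(0, 4, 4))   # -> None
#   creator.append(SampleInfo(1, 4, 4))   # -> None   (4 tokens * 2 <= 10)
#   creator.append(SampleInfo(2, 5, 5))   # -> the first two samples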
class SampleInfo(object):
def __init__(self, i, max_len, min_len):
self.i = i
self.min_len = min_len
self.max_len = max_len
class MinMaxFilter(object):
def __init__(self, max_len, min_len, underlying_creator):
self._min_len = min_len
self._max_len = max_len
self._creator = underlying_creator
def append(self, info):
if info.max_len > self._max_len or info.min_len < self._min_len:
return
else:
return self._creator.append(info)
@property
def batch(self):
return self._creator.batch
class Seq2SeqDataset(Dataset):
def __init__(self,
src_vocab_fpath,
trg_vocab_fpath,
fpattern,
tar_fname=None,
field_delimiter="\t",
token_delimiter=" ",
start_mark="<s>",
end_mark="<e>",
unk_mark="<unk>",
only_src=False):
# convert str to bytes, and use byte data
field_delimiter = field_delimiter.encode("utf8")
token_delimiter = token_delimiter.encode("utf8")
start_mark = start_mark.encode("utf8")
end_mark = end_mark.encode("utf8")
unk_mark = unk_mark.encode("utf8")
self._src_vocab = self.load_dict(src_vocab_fpath)
self._trg_vocab = self.load_dict(trg_vocab_fpath)
self._bos_idx = self._src_vocab[start_mark]
self._eos_idx = self._src_vocab[end_mark]
self._unk_idx = self._src_vocab[unk_mark]
self._only_src = only_src
self._field_delimiter = field_delimiter
self._token_delimiter = token_delimiter
self.load_src_trg_ids(fpattern, tar_fname)
def load_src_trg_ids(self, fpattern, tar_fname):
converters = [
Converter(vocab=self._src_vocab,
beg=self._bos_idx,
end=self._eos_idx,
unk=self._unk_idx,
delimiter=self._token_delimiter,
add_beg=False)
]
if not self._only_src:
converters.append(
Converter(vocab=self._trg_vocab,
beg=self._bos_idx,
end=self._eos_idx,
unk=self._unk_idx,
delimiter=self._token_delimiter,
add_beg=True))
converters = ComposedConverter(converters)
self._src_seq_ids = []
self._trg_seq_ids = None if self._only_src else []
self._sample_infos = []
for i, line in enumerate(self._load_lines(fpattern, tar_fname)):
src_trg_ids = converters(line)
self._src_seq_ids.append(src_trg_ids[0])
lens = [len(src_trg_ids[0])]
if not self._only_src:
self._trg_seq_ids.append(src_trg_ids[1])
lens.append(len(src_trg_ids[1]))
self._sample_infos.append(SampleInfo(i, max(lens), min(lens)))
def _load_lines(self, fpattern, tar_fname):
fpaths = glob.glob(fpattern)
assert len(fpaths) > 0, "no matching file to the provided data path"
if len(fpaths) == 1 and tarfile.is_tarfile(fpaths[0]):
if tar_fname is None:
raise Exception("If tar file provided, please set tar_fname.")
f = tarfile.open(fpaths[0], "rb")
for line in f.extractfile(tar_fname):
fields = line.strip(b"\n").split(self._field_delimiter)
if (not self._only_src
and len(fields) == 2) or (self._only_src
and len(fields) == 1):
yield fields
else:
for fpath in fpaths:
if not os.path.isfile(fpath):
raise IOError("Invalid file: %s" % fpath)
with open(fpath, "rb") as f:
for line in f:
fields = line.strip(b"\n").split(self._field_delimiter)
if (not self._only_src and len(fields) == 2) or (
self._only_src and len(fields) == 1):
yield fields
@staticmethod
def load_dict(dict_path, reverse=False):
word_dict = {}
with open(dict_path, "rb") as fdict:
for idx, line in enumerate(fdict):
if reverse:
word_dict[idx] = line.strip(b"\n")
else:
word_dict[line.strip(b"\n")] = idx
return word_dict
def get_vocab_summary(self):
return len(self._src_vocab), len(
self._trg_vocab), self._bos_idx, self._eos_idx, self._unk_idx
def __getitem__(self, idx):
return (self._src_seq_ids[idx], self._trg_seq_ids[idx][:-1],
self._trg_seq_ids[idx][1:]
) if not self._only_src else self._src_seq_ids[idx]
def __len__(self):
return len(self._sample_infos)
class Seq2SeqBatchSampler(BatchSampler):
def __init__(self,
dataset,
batch_size,
pool_size,
sort_type=SortType.GLOBAL,
min_length=0,
max_length=100,
shuffle=True,
shuffle_batch=False,
use_token_batch=False,
clip_last_batch=False,
seed=0):
for arg, value in locals().items():
if arg != "self":
setattr(self, "_" + arg, value)
self._random = np.random
self._random.seed(seed)
# for multi-devices
self._nranks = ParallelEnv().nranks
self._local_rank = ParallelEnv().local_rank
self._device_id = ParallelEnv().dev_id
def __iter__(self):
# global sort or global shuffle
if self._sort_type == SortType.GLOBAL:
infos = sorted(self._dataset._sample_infos,
key=lambda x: x.max_len)
else:
if self._shuffle:
infos = self._dataset._sample_infos
self._random.shuffle(infos)
else:
infos = self._dataset._sample_infos
if self._sort_type == SortType.POOL:
reverse = True
for i in range(0, len(infos), self._pool_size):
# to avoid placing short next to long sentences
reverse = not reverse
infos[i:i + self._pool_size] = sorted(
infos[i:i + self._pool_size],
key=lambda x: x.max_len,
reverse=reverse)
batches = []
batch_creator = TokenBatchCreator(
self._batch_size
) if self._use_token_batch else SentenceBatchCreator(self._batch_size *
self._nranks)
batch_creator = MinMaxFilter(self._max_length, self._min_length,
batch_creator)
for info in infos:
batch = batch_creator.append(info)
if batch is not None:
batches.append(batch)
if not self._clip_last_batch and len(batch_creator.batch) != 0:
batches.append(batch_creator.batch)
if self._shuffle_batch:
self._random.shuffle(batches)
if not self._use_token_batch:
# when producing batches by sentence count, to ensure that neighboring
# batches (which will be fed and run in parallel) have similar lengths
# (and thus similar computational cost) after shuffling, we shuffle them
# as a whole and split them here
batches = [[
batch[self._batch_size * i:self._batch_size * (i + 1)]
for i in range(self._nranks)
] for batch in batches]
batches = itertools.chain.from_iterable(batches)
# for multi-device
for batch_id, batch in enumerate(batches):
if batch_id % self._nranks == self._local_rank:
batch_indices = [info.i for info in batch]
yield batch_indices
if self._local_rank > len(batches) % self._nranks:
yield batch_indices
def __len__(self):
return 100
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid import ParamAttr
from paddle.fluid.initializer import UniformInitializer
from paddle.fluid.dygraph import Embedding, Linear, Layer
from rnn_api import DynamicDecode, RNN, BasicLSTMCell, RNNCell
from model import Model, Loss
class CrossEntropyCriterion(Loss):
def __init__(self):
super(CrossEntropyCriterion, self).__init__()
def forward(self, outputs, labels):
(predict, mask), label = outputs, labels[0]
cost = layers.softmax_with_cross_entropy(logits=predict,
label=label,
soft_label=False)
masked_cost = layers.elementwise_mul(cost, mask, axis=0)
batch_mean_cost = layers.reduce_mean(masked_cost, dim=[0])
seq_cost = layers.reduce_sum(batch_mean_cost)
return seq_cost
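# Added note (descriptive, not in the original file): the criterion above
# computes token-level softmax cross entropy, zeroes out padded target
# positions via the mask, averages over the batch dimension and then sums
# over the remaining (time) dimensions, yielding a per-sequence style loss.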
class EncoderCell(RNNCell):
def __init__(self,
num_layers,
input_size,
hidden_size,
dropout_prob=0.,
init_scale=0.1):
super(EncoderCell, self).__init__()
self.dropout_prob = dropout_prob
# use add_sublayer to add multi-layers
self.lstm_cells = []
for i in range(num_layers):
self.lstm_cells.append(
self.add_sublayer(
"lstm_%d" % i,
BasicLSTMCell(
input_size=input_size if i == 0 else hidden_size,
hidden_size=hidden_size,
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)))))
def forward(self, step_input, states):
new_states = []
for i, lstm_cell in enumerate(self.lstm_cells):
out, new_state = lstm_cell(step_input, states[i])
step_input = layers.dropout(
out, self.dropout_prob) if self.dropout_prob > 0 else out
new_states.append(new_state)
return step_input, new_states
@property
def state_shape(self):
return [cell.state_shape for cell in self.lstm_cells]
class Encoder(Layer):
def __init__(self,
vocab_size,
embed_dim,
hidden_size,
num_layers,
dropout_prob=0.,
init_scale=0.1):
super(Encoder, self).__init__()
self.embedder = Embedding(
size=[vocab_size, embed_dim],
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)))
self.stack_lstm = RNN(EncoderCell(num_layers, embed_dim, hidden_size,
init_scale),
is_reverse=False,
time_major=False)
def forward(self, sequence, sequence_length):
inputs = self.embedder(sequence)
encoder_output, encoder_state = self.stack_lstm(
inputs, sequence_length=sequence_length)
return encoder_output, encoder_state
class AttentionLayer(Layer):
def __init__(self, hidden_size, bias=False, init_scale=0.1):
super(AttentionLayer, self).__init__()
self.input_proj = Linear(
hidden_size,
hidden_size,
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)),
bias_attr=bias)
self.output_proj = Linear(
hidden_size + hidden_size,
hidden_size,
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)),
bias_attr=bias)
def forward(self, hidden, encoder_output, encoder_padding_mask):
query = self.input_proj(hidden)
attn_scores = layers.matmul(layers.unsqueeze(query, [1]),
encoder_output,
transpose_y=True)
if encoder_padding_mask is not None:
attn_scores = layers.elementwise_add(attn_scores,
encoder_padding_mask)
attn_scores = layers.softmax(attn_scores)
attn_out = layers.squeeze(layers.matmul(attn_scores, encoder_output),
[1])
attn_out = layers.concat([attn_out, hidden], 1)
attn_out = self.output_proj(attn_out)
return attn_out
class DecoderCell(RNNCell):
def __init__(self,
num_layers,
input_size,
hidden_size,
dropout_prob=0.,
init_scale=0.1):
super(DecoderCell, self).__init__()
self.dropout_prob = dropout_prob
# use add_sublayer to add multi-layers
self.lstm_cells = []
for i in range(num_layers):
self.lstm_cells.append(
self.add_sublayer(
"lstm_%d" % i,
BasicLSTMCell(input_size=input_size +
hidden_size if i == 0 else hidden_size,
hidden_size=hidden_size)))
self.attention_layer = AttentionLayer(hidden_size)
def forward(self,
step_input,
states,
encoder_output,
encoder_padding_mask=None):
lstm_states, input_feed = states
new_lstm_states = []
step_input = layers.concat([step_input, input_feed], 1)
for i, lstm_cell in enumerate(self.lstm_cells):
out, new_lstm_state = lstm_cell(step_input, lstm_states[i])
step_input = layers.dropout(
out, self.dropout_prob) if self.dropout_prob > 0 else out
new_lstm_states.append(new_lstm_state)
out = self.attention_layer(step_input, encoder_output,
encoder_padding_mask)
return out, [new_lstm_states, out]
class Decoder(Layer):
def __init__(self,
vocab_size,
embed_dim,
hidden_size,
num_layers,
dropout_prob=0.,
init_scale=0.1):
super(Decoder, self).__init__()
self.embedder = Embedding(
size=[vocab_size, embed_dim],
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)))
self.lstm_attention = RNN(DecoderCell(num_layers, embed_dim,
hidden_size, init_scale),
is_reverse=False,
time_major=False)
self.output_layer = Linear(
hidden_size,
vocab_size,
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)),
bias_attr=False)
def forward(self, target, decoder_initial_states, encoder_output,
encoder_padding_mask):
inputs = self.embedder(target)
decoder_output, _ = self.lstm_attention(
inputs,
decoder_initial_states,
encoder_output=encoder_output,
encoder_padding_mask=encoder_padding_mask)
predict = self.output_layer(decoder_output)
return predict
class Seq2Seq(Model):
def __init__(self,
src_vocab_size,
trg_vocab_size,
embed_dim,
hidden_size,
num_layers,
dropout_prob=0.,
init_scale=0.1):
super(Seq2Seq, self).__init__()
self.hidden_size = hidden_size
self.encoder = Encoder(src_vocab_size, embed_dim, hidden_size,
num_layers, dropout_prob, init_scale)
self.decoder = Decoder(trg_vocab_size, embed_dim, hidden_size,
num_layers, dropout_prob, init_scale)
def forward(self, src, src_length, trg, trg_length):
# encoder
encoder_output, encoder_final_state = self.encoder(src, src_length)
# decoder initial states: use input_feed and the structure is
# [[h,c] * num_layers, input_feed]
decoder_initial_states = [
encoder_final_state,
self.decoder.lstm_attention.cell.get_initial_states(
batch_ref=encoder_output, shape=[self.hidden_size])
]
# attention mask to avoid attending to paddings
src_mask = layers.sequence_mask(src_length,
maxlen=layers.shape(src)[1],
dtype=encoder_output.dtype)
encoder_padding_mask = (src_mask - 1.0) * 1e9
encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])
# decoder with attention
predict = self.decoder(trg, decoder_initial_states, encoder_output,
encoder_padding_mask)
# for target padding mask
mask = layers.sequence_mask(trg_length,
maxlen=layers.shape(trg)[1],
dtype=predict.dtype)
return predict, mask
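# Added illustration (toy numbers): for src_length=[2] and a padded length of
# 4, sequence_mask gives [1, 1, 0, 0]; (mask - 1.0) * 1e9 then yields
# [0, 0, -1e9, -1e9], which, once added to the attention scores, makes softmax
# assign near-zero weight to the padded positions.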
class Seq2SeqInferModel(Seq2Seq):
def __init__(self,
vocab_size,
embed_dim,
hidden_size,
num_layers,
dropout_prob=0.):
pass
# used for continuous evaluation
enable_ce: False
eager_run: False
# The frequency to save trained models when training.
save_step: 10000
# The frequency to fetch and print output when training.
print_step: 100
# path of the checkpoint, to resume the previous training
init_from_checkpoint: ""
# path of the pretrain model, to better solve the current task
init_from_pretrain_model: ""
# path of trained parameter, to make prediction
init_from_params: "trained_params/step_100000/"
# the directory for saving model
save_model: "trained_models"
# the directory for saving inference model.
inference_model_dir: "infer_model"
# Set seed for CE or debug
random_seed: None
# The pattern to match training data files.
training_file: "wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de"
# The pattern to match validation data files.
validation_file: "wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de"
# The pattern to match test data files.
predict_file: "wmt16_ende_data_bpe/newstest2016.tok.bpe.32000.en-de"
# The file to output the translation results of predict_file to.
output_file: "predict.txt"
# The path of vocabulary file of source language.
src_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000"
# The path of vocabulary file of target language.
trg_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000"
# The <bos>, <eos> and <unk> tokens in the dictionary.
special_token: ["<s>", "<e>", "<unk>"]
# max length of sequences
max_length: 256
# whether to use cuda
use_cuda: True
# args for reader, see reader.py for details
token_delimiter: " "
use_token_batch: True
pool_size: 200000
sort_type: "pool"
shuffle: True
shuffle_batch: True
batch_size: 4096
# Hyperparams for training:
# the number of epochs for training
epoch: 30
# the hyper parameters for the Adam optimizer.
# This static learning_rate will be multiplied by the LearningRateScheduler-
# derived learning rate to get the final learning rate.
learning_rate: 0.001
# Hyperparams for generation:
# the parameters for beam search.
beam_size: 5
max_out_len: 256
# the number of decoded sentences to output.
n_best: 1
# Hyperparams for model:
# The following five vocabulary-related configurations will be set
# automatically according to the passed vocabulary path and special tokens.
# size of source word dictionary.
src_vocab_size: 10000
# size of target word dictionary
trg_vocab_size: 10000
# index for <bos> token
bos_idx: 0
# index for <eos> token
eos_idx: 1
# index for <unk> token
unk_idx: 2
embed_dim: 512
hidden_size: 512
num_layers: 2
dropout: 0.1
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import six
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import time
import contextlib
from functools import partial
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable
from paddle.fluid.io import DataLoader
from configure import PDConfig
from reader import prepare_train_input, Seq2SeqDataset, Seq2SeqBatchSampler
from seq2seq import Seq2Seq, CrossEntropyCriterion
from model import Input, set_device
from callbacks import ProgBarLogger
class LoggerCallback(ProgBarLogger):
def __init__(self, log_freq=1, verbose=2, loss_normalizer=0.):
super(LoggerCallback, self).__init__(log_freq, verbose)
# TODO: wrap these overridden functions to simplify
self.loss_normalizer = loss_normalizer
def on_train_begin(self, logs=None):
super(LoggerCallback, self).on_train_begin(logs)
self.train_metrics += ["normalized loss", "ppl"]
def on_train_batch_end(self, step, logs=None):
logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer
logs["ppl"] = np.exp(min(logs["loss"][0], 100))
super(LoggerCallback, self).on_train_batch_end(step, logs)
def on_eval_begin(self, logs=None):
super(LoggerCallback, self).on_eval_begin(logs)
self.eval_metrics += ["normalized loss", "ppl"]
def on_eval_batch_end(self, step, logs=None):
logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer
logs["ppl"] = np.exp(min(logs["loss"][0], 100))
super(LoggerCallback, self).on_eval_batch_end(step, logs)
def do_train(args):
device = set_device("gpu" if args.use_cuda else "cpu")
fluid.enable_dygraph(device) if args.eager_run else None
# set seed for CE
random_seed = eval(str(args.random_seed))
if random_seed is not None:
fluid.default_main_program().random_seed = random_seed
fluid.default_startup_program().random_seed = random_seed
# define model
inputs = [
Input([None, None], "int64", name="src_word"),
Input([None], "int64", name="src_length"),
Input([None, None], "int64", name="trg_word"),
Input([None], "int64", name="trg_length"),
]
labels = [
Input([None, None, 1], "int64", name="label"),
]
model = Seq2Seq(args.src_vocab_size, args.trg_vocab_size, args.embed_dim,
args.hidden_size, args.num_layers, args.dropout)
model.prepare(fluid.optimizer.Adam(learning_rate=args.learning_rate,
parameter_list=model.parameters()),
CrossEntropyCriterion(),
inputs=inputs,
labels=labels)
batch_size = 32
src_seq_len = 10
trg_seq_len = 12
iter_num = 10
def random_generator():
for i in range(iter_num):
src = np.random.randint(2, args.src_vocab_size,
(batch_size, src_seq_len)).astype("int64")
src_length = np.random.randint(
1, src_seq_len, (batch_size, )).astype("int64")
trg = np.random.randint(2, args.trg_vocab_size,
(batch_size, trg_seq_len)).astype("int64")
trg_length = np.random.randint(1, trg_seq_len,
(batch_size, )).astype("int64")
label = np.random.randint(1, trg_seq_len,
(batch_size, trg_seq_len, 1)).astype("int64")
yield src, src_length, trg, trg_length, label
model.fit(train_data=random_generator, log_freq=1)
exit(0)
dataset = Seq2SeqDataset(fpattern=args.training_file,
src_vocab_fpath=args.src_vocab_fpath,
trg_vocab_fpath=args.trg_vocab_fpath,
token_delimiter=args.token_delimiter,
start_mark=args.special_token[0],
end_mark=args.special_token[1],
unk_mark=args.special_token[2])
args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \
args.unk_idx = dataset.get_vocab_summary()
batch_sampler = Seq2SeqBatchSampler(dataset=dataset,
use_token_batch=args.use_token_batch,
batch_size=args.batch_size,
pool_size=args.pool_size,
sort_type=args.sort_type,
shuffle=args.shuffle,
shuffle_batch=args.shuffle_batch,
max_length=args.max_length)
train_loader = DataLoader(dataset=dataset,
batch_sampler=batch_sampler,
places=device,
feed_list=[x.forward() for x in inputs + labels],
collate_fn=partial(prepare_train_input,
src_pad_idx=args.eos_idx,
trg_pad_idx=args.eos_idx),
num_workers=0,
return_list=True)
model.fit(train_data=train_loader,
eval_data=None,
epochs=1,
eval_freq=1,
save_freq=1,
verbose=2,
callbacks=[
LoggerCallback(log_freq=args.print_step)
])
if __name__ == "__main__":
args = PDConfig(yaml_file="./seq2seq.yaml")
args.build()
args.Print()
do_train(args)
@@ -8,11 +8,19 @@ import paddle
import paddle.fluid as fluid
import paddle.fluid.layers.utils as utils
from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
from paddle.fluid.dygraph import to_variable, Embedding, Linear, LayerNorm
from paddle.fluid.data_feeder import convert_dtype
from paddle.fluid import layers
from paddle.fluid.dygraph import Layer
from paddle.layers import BeamSearchDecoder
__all__ = [
'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode',
'BeamSearchDecoder', 'MultiHeadAttention', 'FFN',
'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer',
'TransformerDecoder', 'TransformerBeamSearchDecoder'
]
class RNNCell(Layer):
@@ -307,14 +315,16 @@ class BasicGRUCell(RNNCell):
gate_bias_attr = self._bias_attr
candidate_bias_attr = self._bias_attr
self._gate_bias = self.create_parameter(
attr=gate_bias_attr,
shape=[2 * self._hiden_size],
dtype=self._dtype,
is_bias=True)
self._candidate_bias = self.create_parameter(
attr=candidate_bias_attr,
shape=[self._hiden_size],
dtype=self._dtype,
is_bias=True)
def forward(self, input, state):
pre_hidden = state
@@ -329,8 +339,8 @@ class BasicGRUCell(RNNCell):
r_hidden = r * pre_hidden
candidate = layers.matmul(
layers.concat([input, r_hidden], 1), self._candidate_weight)
candidate = layers.elementwise_add(candidate, self._candidate_bias)
c = self._activation(candidate)
@@ -643,3 +653,340 @@ class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):
beam_search_state.finished)
return (beam_search_output, beam_search_state, next_inputs, finished)
### Transformer Modules ###
class PrePostProcessLayer(Layer):
"""
PrePostProcessLayer
"""
def __init__(self, process_cmd, d_model, dropout_rate):
super(PrePostProcessLayer, self).__init__()
self.process_cmd = process_cmd
self.functors = []
for cmd in self.process_cmd:
if cmd == "a": # add residual connection
self.functors.append(lambda x, y: x + y if y else x)
elif cmd == "n": # add layer normalization
self.functors.append(
self.add_sublayer(
"layer_norm_%d" % len(
self.sublayers(include_sublayers=False)),
LayerNorm(
normalized_shape=d_model,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(0.)))))
elif cmd == "d": # add dropout
self.functors.append(lambda x: layers.dropout(
x, dropout_prob=dropout_rate, is_test=False)
if dropout_rate else x)
def forward(self, x, residual=None):
for i, cmd in enumerate(self.process_cmd):
if cmd == "a":
x = self.functors[i](x, residual)
else:
x = self.functors[i](x)
return x
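# Added note (descriptive sketch): with the commands used below,
# preprocess_cmd="n" applies LayerNorm before each sub-layer and
# postprocess_cmd="da" applies dropout followed by the residual add after it,
# i.e. roughly out = x + dropout(sublayer(layer_norm(x))), the pre-norm
# Transformer variant.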
class MultiHeadAttention(Layer):
"""
Multi-Head Attention
"""
def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.):
super(MultiHeadAttention, self).__init__()
self.n_head = n_head
self.d_key = d_key
self.d_value = d_value
self.d_model = d_model
self.dropout_rate = dropout_rate
self.q_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
self.k_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
self.v_fc = Linear(
input_dim=d_model, output_dim=d_value * n_head, bias_attr=False)
self.proj_fc = Linear(
input_dim=d_value * n_head, output_dim=d_model, bias_attr=False)
def _prepare_qkv(self, queries, keys, values, cache=None):
if keys is None: # self-attention
keys, values = queries, queries
static_kv = False
else: # cross-attention
static_kv = True
q = self.q_fc(queries)
q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key])
q = layers.transpose(x=q, perm=[0, 2, 1, 3])
if cache is not None and static_kv and "static_k" in cache:
# for encoder-decoder attention in inference, with cached static k/v
k = cache["static_k"]
v = cache["static_v"]
else:
k = self.k_fc(keys)
v = self.v_fc(values)
k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
k = layers.transpose(x=k, perm=[0, 2, 1, 3])
v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
v = layers.transpose(x=v, perm=[0, 2, 1, 3])
if cache is not None:
if static_kv and not "static_k" in cache:
# for encoder-decoder attention in inference, static k/v not yet cached
cache["static_k"], cache["static_v"] = k, v
elif not static_kv:
# for decoder self-attention in inference
cache_k, cache_v = cache["k"], cache["v"]
k = layers.concat([cache_k, k], axis=2)
v = layers.concat([cache_v, v], axis=2)
cache["k"], cache["v"] = k, v
return q, k, v
def forward(self, queries, keys, values, attn_bias, cache=None):
# compute q, k, v
q, k, v = self._prepare_qkv(queries, keys, values, cache)
# scaled dot-product attention
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5)
if attn_bias:
product += attn_bias
weights = layers.softmax(product)
if self.dropout_rate:
weights = layers.dropout(
weights, dropout_prob=self.dropout_rate, is_test=False)
out = layers.matmul(weights, v)
# combine heads
out = layers.transpose(out, perm=[0, 2, 1, 3])
out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
# project to output
out = self.proj_fc(out)
return out
def cal_kv(self, keys, values):
k = self.k_fc(keys)
v = self.v_fc(values)
k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
k = layers.transpose(x=k, perm=[0, 2, 1, 3])
v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
v = layers.transpose(x=v, perm=[0, 2, 1, 3])
return k, v
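# Added shape sketch (descriptive): for an input of shape
# [batch, seq_len, d_model], q/k/v are projected and reshaped to
# [batch, n_head, seq_len, d_key] (d_value for v); attention weights are
# [batch, n_head, q_len, k_len]; the heads are then transposed back and merged
# to [batch, q_len, n_head * d_value] before proj_fc.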
class FFN(Layer):
"""
Feed-Forward Network
"""
def __init__(self, d_inner_hid, d_model, dropout_rate):
super(FFN, self).__init__()
self.dropout_rate = dropout_rate
self.fc1 = Linear(
input_dim=d_model, output_dim=d_inner_hid, act="relu")
self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model)
def forward(self, x):
hidden = self.fc1(x)
if self.dropout_rate:
hidden = layers.dropout(
hidden, dropout_prob=self.dropout_rate, is_test=False)
out = self.fc2(hidden)
return out
class TransformerEncoderLayer(Layer):
"""
EncoderLayer
"""
def __init__(self,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(TransformerEncoderLayer, self).__init__()
self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
attention_dropout)
self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout)
self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
self.ffn = FFN(d_inner_hid, d_model, relu_dropout)
self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout)
def forward(self, enc_input, attn_bias):
attn_output = self.self_attn(
self.preprocesser1(enc_input), None, None, attn_bias)
attn_output = self.postprocesser1(attn_output, enc_input)
ffn_output = self.ffn(self.preprocesser2(attn_output))
ffn_output = self.postprocesser2(ffn_output, attn_output)
return ffn_output
class TransformerEncoder(Layer):
"""
encoder
"""
def __init__(self,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(TransformerEncoder, self).__init__()
self.encoder_layers = list()
for i in range(n_layer):
self.encoder_layers.append(
self.add_sublayer(
"layer_%d" % i,
TransformerEncoderLayer(
n_head, d_key, d_value, d_model, d_inner_hid,
prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)))
self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
def forward(self, enc_input, attn_bias):
for encoder_layer in self.encoder_layers:
enc_output = encoder_layer(enc_input, attn_bias)
enc_input = enc_output
return self.processer(enc_output)
class TransformerDecoderLayer(Layer):
"""
decoder
"""
def __init__(self,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(TransformerDecoderLayer, self).__init__()
self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
attention_dropout)
self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout)
self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
self.cross_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
attention_dropout)
self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout)
self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
self.ffn = FFN(d_inner_hid, d_model, relu_dropout)
self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout)
def forward(self,
dec_input,
enc_output,
self_attn_bias,
cross_attn_bias,
cache=None):
self_attn_output = self.self_attn(
self.preprocesser1(dec_input), None, None, self_attn_bias, cache)
self_attn_output = self.postprocesser1(self_attn_output, dec_input)
cross_attn_output = self.cross_attn(
self.preprocesser2(self_attn_output), enc_output, enc_output,
cross_attn_bias, cache)
cross_attn_output = self.postprocesser2(cross_attn_output,
self_attn_output)
ffn_output = self.ffn(self.preprocesser3(cross_attn_output))
ffn_output = self.postprocesser3(ffn_output, cross_attn_output)
return ffn_output
class TransformerDecoder(Layer):
"""
decoder
"""
def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
prepostprocess_dropout, attention_dropout, relu_dropout,
preprocess_cmd, postprocess_cmd):
super(TransformerDecoder, self).__init__()
self.decoder_layers = list()
for i in range(n_layer):
self.decoder_layers.append(
self.add_sublayer(
"layer_%d" % i,
TransformerDecoderLayer(
n_head, d_key, d_value, d_model, d_inner_hid,
prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)))
self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
def forward(self,
dec_input,
enc_output,
self_attn_bias,
cross_attn_bias,
caches=None):
for i, decoder_layer in enumerate(self.decoder_layers):
dec_output = decoder_layer(dec_input, enc_output, self_attn_bias,
cross_attn_bias, None
if caches is None else caches[i])
dec_input = dec_output
return self.processer(dec_output)
def prepare_static_cache(self, enc_output):
return [
dict(
zip(("static_k", "static_v"),
decoder_layer.cross_attn.cal_kv(enc_output, enc_output)))
for decoder_layer in self.decoder_layers
]
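# Added usage sketch (hypothetical setup, not in the original file): for
# incremental decoding one cache dict per layer is kept; "static_k"/"static_v"
# come from prepare_static_cache(enc_output) for the fixed cross-attention,
# while "k"/"v" hold the growing decoder self-attention keys/values,
#   caches = decoder.prepare_static_cache(enc_output)
#   for cache in caches:
#       cache["k"] = cache["v"] = <empty tensor of shape [batch, n_head, 0, d]>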
@@ -55,62 +55,70 @@ def do_predict(args):
fluid.enable_dygraph(device) if args.eager_run else None
inputs = [
Input(
[None, None], "int64", name="src_word"),
Input(
[None, None], "int64", name="src_pos"),
Input(
[None, args.n_head, None, None],
"float32",
name="src_slf_attn_bias"),
Input(
[None, args.n_head, None, None],
"float32",
name="trg_src_attn_bias"),
]
# define data
dataset = Seq2SeqDataset(
fpattern=args.predict_file,
src_vocab_fpath=args.src_vocab_fpath,
trg_vocab_fpath=args.trg_vocab_fpath,
token_delimiter=args.token_delimiter,
start_mark=args.special_token[0],
end_mark=args.special_token[1],
unk_mark=args.special_token[2])
args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \
args.unk_idx = dataset.get_vocab_summary()
trg_idx2word = Seq2SeqDataset.load_dict(
dict_path=args.trg_vocab_fpath, reverse=True)
batch_sampler = Seq2SeqBatchSampler(
dataset=dataset,
use_token_batch=False,
batch_size=args.batch_size,
max_length=args.max_length)
data_loader = DataLoader(
dataset=dataset,
batch_sampler=batch_sampler,
places=device,
feed_list=None
if fluid.in_dygraph_mode() else [x.forward() for x in inputs],
collate_fn=partial(
prepare_infer_input, src_pad_idx=args.eos_idx, n_head=args.n_head),
num_workers=0,
return_list=True)
# define model
transformer = InferTransformer(
args.src_vocab_size,
args.trg_vocab_size,
args.max_length + 1,
args.n_layer,
args.n_head,
args.d_key,
args.d_value,
args.d_model,
args.d_inner_hid,
args.prepostprocess_dropout,
args.attention_dropout,
args.relu_dropout,
args.preprocess_cmd,
args.postprocess_cmd,
args.weight_sharing,
args.bos_idx,
args.eos_idx,
beam_size=args.beam_size,
max_out_len=args.max_out_len)
transformer.prepare(inputs=inputs)
# load the trained model
@@ -126,8 +134,7 @@ def do_predict(args):
for ins in finished_seq:
for beam_idx, beam in enumerate(ins):
if beam_idx >= args.n_best: break
id_list = post_process_seq(beam, args.bos_idx, args.eos_idx)
word_list = [trg_idx2word[id] for id in id_list]
sequence = b" ".join(word_list) + b"\n"
f.write(sequence)
This diff is collapsed.
@@ -71,19 +71,26 @@ def do_train(args):
# define inputs
inputs = [
Input(
[None, None], "int64", name="src_word"),
Input(
[None, None], "int64", name="src_pos"),
Input(
[None, args.n_head, None, None],
"float32",
name="src_slf_attn_bias"),
Input(
[None, None], "int64", name="trg_word"),
Input(
[None, None], "int64", name="trg_pos"),
Input(
[None, args.n_head, None, None],
"float32",
name="trg_slf_attn_bias"),
Input(
[None, args.n_head, None, None],
"float32",
name="trg_src_attn_bias"),
]
labels = [
Input(
@@ -97,33 +104,38 @@ def do_train(args):
data_files = [args.training_file, args.validation_file
] if args.validation_file else [args.training_file]
for i, data_file in enumerate(data_files):
dataset = Seq2SeqDataset(
fpattern=data_file,
src_vocab_fpath=args.src_vocab_fpath,
trg_vocab_fpath=args.trg_vocab_fpath,
token_delimiter=args.token_delimiter,
start_mark=args.special_token[0],
end_mark=args.special_token[1],
unk_mark=args.special_token[2])
args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \
args.unk_idx = dataset.get_vocab_summary()
batch_sampler = Seq2SeqBatchSampler(
dataset=dataset,
use_token_batch=args.use_token_batch,
batch_size=args.batch_size,
pool_size=args.pool_size,
sort_type=args.sort_type,
shuffle=args.shuffle,
shuffle_batch=args.shuffle_batch,
max_length=args.max_length)
data_loader = DataLoader(
dataset=dataset,
batch_sampler=batch_sampler,
places=device,
feed_list=None if fluid.in_dygraph_mode() else
[x.forward() for x in inputs + labels],
collate_fn=partial(
prepare_train_input,
src_pad_idx=args.eos_idx,
trg_pad_idx=args.eos_idx,
n_head=args.n_head),
num_workers=0,  # TODO: use multi-process
return_list=True)
data_loaders[i] = data_loader
train_loader, eval_loader = data_loaders
@@ -135,15 +147,17 @@ def do_train(args):
args.relu_dropout, args.preprocess_cmd, args.postprocess_cmd,
args.weight_sharing, args.bos_idx, args.eos_idx)
transformer.prepare(
fluid.optimizer.Adam(
learning_rate=fluid.layers.noam_decay(args.d_model,
args.warmup_steps),
beta1=args.beta1,
beta2=args.beta2,
epsilon=float(args.eps),
parameter_list=transformer.parameters()),
CrossEntropyCriterion(args.label_smooth_eps),
inputs=inputs,
labels=labels)
## init from some checkpoint, to resume the previous training
if args.init_from_checkpoint:
@@ -21,6 +21,7 @@ import paddle.fluid.layers as layers
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable
from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
from model import Model, CrossEntropy, Loss
from text import TransformerBeamSearchDecoder, DynamicDecode
def position_encoding_init(n_position, d_pos_vec):
@@ -604,9 +605,6 @@ class Transformer(Model):
return predict
from rnn_api import TransformerBeamSearchDecoder, DynamicDecode
class TransfomerCell(object):
"""
Let inputs=(trg_word, trg_pos), states=cache to make Transformer can be