Commit 94872ce6 authored by: G guosheng

Update text.py and Transformer.

Parent 57365421
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import distutils.util
def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--train_data_prefix",
type=str,
help="file prefix for train data")
parser.add_argument("--eval_data_prefix",
type=str,
help="file prefix for eval data")
parser.add_argument("--test_data_prefix",
type=str,
help="file prefix for test data")
parser.add_argument("--vocab_prefix",
type=str,
help="file prefix for vocab")
parser.add_argument("--src_lang", type=str, help="source language suffix")
parser.add_argument("--tar_lang", type=str, help="target language suffix")
parser.add_argument("--attention",
type=eval,
default=False,
help="Whether use attention model")
parser.add_argument("--optimizer",
type=str,
default='adam',
help="optimizer to use, only supprt[sgd|adam]")
parser.add_argument("--learning_rate",
type=float,
default=0.001,
help="learning rate for optimizer")
parser.add_argument("--num_layers",
type=int,
default=1,
help="layers number of encoder and decoder")
parser.add_argument("--hidden_size",
type=int,
default=100,
help="hidden size of encoder and decoder")
parser.add_argument("--src_vocab_size", type=int, help="source vocab size")
parser.add_argument("--tar_vocab_size", type=int, help="target vocab size")
parser.add_argument("--batch_size",
type=int,
help="batch size of each step")
parser.add_argument("--max_epoch",
type=int,
default=12,
help="max epoch for the training")
parser.add_argument("--max_len",
type=int,
default=50,
help="max length for source and target sentence")
parser.add_argument("--dropout",
type=float,
default=0.0,
help="drop probability")
parser.add_argument("--init_scale",
type=float,
default=0.0,
help="init scale for parameter")
parser.add_argument("--max_grad_norm",
type=float,
default=5.0,
help="max grad norm for global norm clip")
parser.add_argument("--model_path",
type=str,
default='model',
help="model path for model to save")
parser.add_argument("--reload_model",
type=str,
help="reload model to inference")
parser.add_argument("--infer_file",
type=str,
help="file name for inference")
parser.add_argument("--infer_output_file",
type=str,
default='infer_output',
help="file name for inference output")
parser.add_argument("--beam_size",
type=int,
default=10,
help="file name for inference")
parser.add_argument('--use_gpu',
type=eval,
default=False,
help='Whether to use gpu [True|False]')
parser.add_argument('--eager_run',
type=eval,
default=False,
help='Whether to use dygraph')
parser.add_argument("--enable_ce",
action='store_true',
help="The flag indicating whether to run the task "
"for continuous evaluation.")
parser.add_argument("--profile",
action='store_true',
help="Whether enable the profile.")
# NOTE: profiler args, used for benchmark
parser.add_argument(
"--profiler_path",
type=str,
default='./seq2seq.profile',
help="the profiler output file path. (used for benchmark)")
args = parser.parse_args()
return args
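# Usage sketch (illustrative addition, not part of the original file):
# parse_args() consumes sys.argv, so a hypothetical training command could be
#   python train.py --train_data_prefix data/train --vocab_prefix data/vocab \
#       --src_lang en --tar_lang vi --attention True --batch_size 128
# The helper below is hypothetical and only shows how the parsed options
# would be inspected programmatically.
def _demo_parse_args():
    import sys
    sys.argv = [
        "train.py", "--src_lang", "en", "--tar_lang", "vi",
        "--attention", "True", "--batch_size", "128"
    ]  # override argv only for this illustrative call
    args = parse_args()
    print(args.attention, args.learning_rate, args.beam_size)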
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import argparse
import json
import yaml
import six
import logging
logging_only_message = "%(message)s"
logging_details = "%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s"
class JsonConfig(object):
"""
A high-level API for handling a JSON configuration file.
"""
def __init__(self, config_path):
self._config_dict = self._parse(config_path)
def _parse(self, config_path):
try:
with open(config_path) as json_file:
config_dict = json.load(json_file)
except:
raise IOError("Error in parsing bert model config file '%s'" %
config_path)
else:
return config_dict
def __getitem__(self, key):
return self._config_dict[key]
def print_config(self):
for arg, value in sorted(six.iteritems(self._config_dict)):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
class ArgumentGroup(object):
def __init__(self, parser, title, des):
self._group = parser.add_argument_group(title=title, description=des)
def add_arg(self, name, type, default, help, **kwargs):
type = str2bool if type == bool else type
self._group.add_argument(
"--" + name,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
class ArgConfig(object):
"""
A high-level api for handling argument configs.
"""
def __init__(self):
parser = argparse.ArgumentParser()
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5,
"Learning rate used to train with warmup.")
train_g.add_arg(
"lr_scheduler",
str,
"linear_warmup_decay",
"scheduler of learning rate.",
choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01,
"Weight decay rate for L2 regularizer.")
train_g.add_arg(
"warmup_proportion", float, 0.1,
"Proportion of training steps to perform linear learning rate warmup for."
)
train_g.add_arg("save_steps", int, 1000,
"The steps interval to save checkpoints.")
train_g.add_arg("use_fp16", bool, False,
"Whether to use fp16 mixed precision training.")
train_g.add_arg(
"loss_scaling", float, 1.0,
"Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled."
)
train_g.add_arg("pred_dir", str, None,
"Path to save the prediction results")
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10,
"The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True,
"If set, use GPU for training.")
run_type_g.add_arg(
"use_fast_executor", bool, False,
"If set, use fast parallel executor (in experiment).")
run_type_g.add_arg(
"num_iteration_per_drop_scope", int, 1,
"Ihe iteration intervals to clean up temporary variables.")
run_type_g.add_arg("do_train", bool, True,
"Whether to perform training.")
run_type_g.add_arg("do_predict", bool, True,
"Whether to perform prediction.")
custom_g = ArgumentGroup(parser, "customize", "customized options.")
self.custom_g = custom_g
self.parser = parser
def add_arg(self, name, dtype, default, descrip):
self.custom_g.add_arg(name, dtype, default, descrip)
def build_conf(self):
return self.parser.parse_args()
def str2bool(v):
# because argparse does not support parsing strings like "True"/"False"
# as python booleans directly
return v.lower() in ("true", "t", "1")
def print_arguments(args, log=None):
if not log:
print('----------- Configuration Arguments -----------')
for arg, value in sorted(six.iteritems(vars(args))):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
else:
log.info('----------- Configuration Arguments -----------')
for arg, value in sorted(six.iteritems(vars(args))):
log.info('%s: %s' % (arg, value))
log.info('------------------------------------------------')
class PDConfig(object):
"""
A high-level API for managing configuration files in PaddlePaddle.
Can work jointly with command-line arguments, json files and yaml files.
"""
def __init__(self, json_file="", yaml_file="", fuse_args=True):
"""
Init function for PDConfig.
json_file: the path to the json configure file.
yaml_file: the path to the yaml configure file.
fuse_args: if fuse the json/yaml configs with argparse.
"""
assert isinstance(json_file, str)
assert isinstance(yaml_file, str)
if json_file != "" and yaml_file != "":
raise Warning(
"json_file and yaml_file can not co-exist for now. please only use one configure file type."
)
return
self.args = None
self.arg_config = {}
self.json_config = {}
self.yaml_config = {}
parser = argparse.ArgumentParser()
self.default_g = ArgumentGroup(parser, "default", "default options.")
self.yaml_g = ArgumentGroup(parser, "yaml", "options from yaml.")
self.json_g = ArgumentGroup(parser, "json", "options from json.")
self.com_g = ArgumentGroup(parser, "custom", "customized options.")
self.default_g.add_arg("do_train", bool, False,
"Whether to perform training.")
self.default_g.add_arg("do_predict", bool, False,
"Whether to perform predicting.")
self.default_g.add_arg("do_eval", bool, False,
"Whether to perform evaluating.")
self.default_g.add_arg("do_save_inference_model", bool, False,
"Whether to perform model saving for inference.")
# NOTE: args for profiler
self.default_g.add_arg("is_profiler", int, 0, "the switch of profiler tools. (used for benchmark)")
self.default_g.add_arg("profiler_path", str, './', "the profiler output file path. (used for benchmark)")
self.default_g.add_arg("max_iter", int, 0, "the max train batch num.(used for benchmark)")
self.parser = parser
if json_file != "":
self.load_json(json_file, fuse_args=fuse_args)
if yaml_file:
self.load_yaml(yaml_file, fuse_args=fuse_args)
def load_json(self, file_path, fuse_args=True):
if not os.path.exists(file_path):
raise Warning("the json file %s does not exist." % file_path)
return
with open(file_path, "r") as fin:
self.json_config = json.loads(fin.read())
fin.close()
if fuse_args:
for name in self.json_config:
if isinstance(self.json_config[name], list):
self.json_g.add_arg(
name,
type(self.json_config[name][0]),
self.json_config[name],
"This is from %s" % file_path,
nargs=len(self.json_config[name]))
continue
if not isinstance(self.json_config[name], int) \
and not isinstance(self.json_config[name], float) \
and not isinstance(self.json_config[name], str) \
and not isinstance(self.json_config[name], bool):
continue
self.json_g.add_arg(name,
type(self.json_config[name]),
self.json_config[name],
"This is from %s" % file_path)
def load_yaml(self, file_path, fuse_args=True):
if not os.path.exists(file_path):
raise Warning("the yaml file %s does not exist." % file_path)
return
with open(file_path, "r") as fin:
self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader)
fin.close()
if fuse_args:
for name in self.yaml_config:
if isinstance(self.yaml_config[name], list):
self.yaml_g.add_arg(
name,
type(self.yaml_config[name][0]),
self.yaml_config[name],
"This is from %s" % file_path,
nargs=len(self.yaml_config[name]))
continue
if not isinstance(self.yaml_config[name], int) \
and not isinstance(self.yaml_config[name], float) \
and not isinstance(self.yaml_config[name], str) \
and not isinstance(self.yaml_config[name], bool):
continue
self.yaml_g.add_arg(name,
type(self.yaml_config[name]),
self.yaml_config[name],
"This is from %s" % file_path)
def build(self):
self.args = self.parser.parse_args()
self.arg_config = vars(self.args)
def __add__(self, new_arg):
assert isinstance(new_arg, list) or isinstance(new_arg, tuple)
assert len(new_arg) >= 3
assert self.args is None
name = new_arg[0]
dtype = new_arg[1]
dvalue = new_arg[2]
desc = new_arg[3] if len(
new_arg) == 4 else "Description is not provided."
self.com_g.add_arg(name, dtype, dvalue, desc)
return self
def __getattr__(self, name):
if name in self.arg_config:
return self.arg_config[name]
if name in self.json_config:
return self.json_config[name]
if name in self.yaml_config:
return self.yaml_config[name]
raise Warning("The argument %s is not defined." % name)
def Print(self):
print("-" * 70)
for name in self.arg_config:
print("%s:\t\t\t\t%s" % (str(name), str(self.arg_config[name])))
for name in self.json_config:
if name not in self.arg_config:
print("%s:\t\t\t\t%s" %
(str(name), str(self.json_config[name])))
for name in self.yaml_config:
if name not in self.arg_config:
print("%s:\t\t\t\t%s" %
(str(name), str(self.yaml_config[name])))
print("-" * 70)
if __name__ == "__main__":
"""
pd_config = PDConfig(json_file = "./test/bert_config.json")
pd_config.build()
print(pd_config.do_train)
print(pd_config.hidden_size)
pd_config = PDConfig(yaml_file = "./test/bert_config.yaml")
pd_config.build()
print(pd_config.do_train)
print(pd_config.hidden_size)
"""
pd_config = PDConfig(yaml_file="./test/bert_config.yaml")
pd_config += ("my_age", int, 18, "I am forever 18.")
pd_config.build()
print(pd_config.do_train)
print(pd_config.hidden_size)
print(pd_config.my_age)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import six
import os
import tarfile
import itertools
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.fluid.io import BatchSampler, DataLoader, Dataset
def prepare_train_input(insts, src_pad_idx, trg_pad_idx):
"""
Put all padded data needed by training into a list.
"""
src, src_length = pad_batch_data([inst[0] for inst in insts], src_pad_idx)
trg, trg_length = pad_batch_data([inst[1] for inst in insts], trg_pad_idx)
label, _ = pad_batch_data([inst[2] for inst in insts], trg_pad_idx)
return src, src_length, trg, trg_length, np.expand_dims(label, -1)
def pad_batch_data(insts, pad_idx):
"""
Pad the instances to the max sequence length in the batch, and return the
padded data together with the original sequence lengths.
"""
inst_length = np.array([len(inst) for inst in insts], dtype="int64")
max_len = np.max(inst_length)
inst_data = np.array(
[inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
return inst_data, inst_length
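# Illustrative sketch (hypothetical helper, not part of the original file):
# pad_batch_data right-pads every instance with pad_idx up to the longest
# instance in the batch and also returns the original lengths.
def _demo_pad_batch_data():
    insts = [[2, 5, 7], [3, 4], [6, 8, 9, 1]]
    data, lengths = pad_batch_data(insts, pad_idx=0)
    # data    -> [[2, 5, 7, 0], [3, 4, 0, 0], [6, 8, 9, 1]]
    # lengths -> [3, 2, 4]
    print(data, lengths)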
class SortType(object):
GLOBAL = 'global'
POOL = 'pool'
NONE = "none"
class Converter(object):
def __init__(self, vocab, beg, end, unk, delimiter, add_beg):
self._vocab = vocab
self._beg = beg
self._end = end
self._unk = unk
self._delimiter = delimiter
self._add_beg = add_beg
def __call__(self, sentence):
return ([self._beg] if self._add_beg else []) + [
self._vocab.get(w, self._unk)
for w in sentence.split(self._delimiter)
] + [self._end]
class ComposedConverter(object):
def __init__(self, converters):
self._converters = converters
def __call__(self, parallel_sentence):
return [
self._converters[i](parallel_sentence[i])
for i in range(len(self._converters))
]
class SentenceBatchCreator(object):
def __init__(self, batch_size):
self.batch = []
self._batch_size = batch_size
def append(self, info):
self.batch.append(info)
if len(self.batch) == self._batch_size:
tmp = self.batch
self.batch = []
return tmp
class TokenBatchCreator(object):
def __init__(self, batch_size):
self.batch = []
self.max_len = -1
self._batch_size = batch_size
def append(self, info):
cur_len = info.max_len
max_len = max(self.max_len, cur_len)
if max_len * (len(self.batch) + 1) > self._batch_size:
result = self.batch
self.batch = [info]
self.max_len = cur_len
return result
else:
self.max_len = max_len
self.batch.append(info)
class SampleInfo(object):
def __init__(self, i, max_len, min_len):
self.i = i
self.min_len = min_len
self.max_len = max_len
class MinMaxFilter(object):
def __init__(self, max_len, min_len, underlying_creator):
self._min_len = min_len
self._max_len = max_len
self._creator = underlying_creator
def append(self, info):
if info.max_len > self._max_len or info.min_len < self._min_len:
return
else:
return self._creator.append(info)
@property
def batch(self):
return self._creator.batch
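# Illustrative sketch (hypothetical helper, not part of the original file):
# TokenBatchCreator groups samples so that (max length in batch) * (samples in
# batch) stays within the token budget, while MinMaxFilter drops samples whose
# length falls outside [min_len, max_len] before they reach the creator.
def _demo_token_batching():
    creator = MinMaxFilter(max_len=8, min_len=2,
                           underlying_creator=TokenBatchCreator(batch_size=12))
    emitted = []
    for i, length in enumerate([3, 4, 5, 1, 6, 9, 2]):
        batch = creator.append(SampleInfo(i, max_len=length, min_len=length))
        if batch is not None:
            emitted.append([info.i for info in batch])
    # samples of length 1 and 9 are filtered out; under the 12-token budget
    # emitted -> [[0, 1], [2, 4]] and creator.batch still holds sample 6
    print(emitted, [info.i for info in creator.batch])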
class Seq2SeqDataset(Dataset):
def __init__(self,
src_vocab_fpath,
trg_vocab_fpath,
fpattern,
tar_fname=None,
field_delimiter="\t",
token_delimiter=" ",
start_mark="<s>",
end_mark="<e>",
unk_mark="<unk>",
only_src=False):
# convert str to bytes, and use byte data
field_delimiter = field_delimiter.encode("utf8")
token_delimiter = token_delimiter.encode("utf8")
start_mark = start_mark.encode("utf8")
end_mark = end_mark.encode("utf8")
unk_mark = unk_mark.encode("utf8")
self._src_vocab = self.load_dict(src_vocab_fpath)
self._trg_vocab = self.load_dict(trg_vocab_fpath)
self._bos_idx = self._src_vocab[start_mark]
self._eos_idx = self._src_vocab[end_mark]
self._unk_idx = self._src_vocab[unk_mark]
self._only_src = only_src
self._field_delimiter = field_delimiter
self._token_delimiter = token_delimiter
self.load_src_trg_ids(fpattern, tar_fname)
def load_src_trg_ids(self, fpattern, tar_fname):
converters = [
Converter(vocab=self._src_vocab,
beg=self._bos_idx,
end=self._eos_idx,
unk=self._unk_idx,
delimiter=self._token_delimiter,
add_beg=False)
]
if not self._only_src:
converters.append(
Converter(vocab=self._trg_vocab,
beg=self._bos_idx,
end=self._eos_idx,
unk=self._unk_idx,
delimiter=self._token_delimiter,
add_beg=True))
converters = ComposedConverter(converters)
self._src_seq_ids = []
self._trg_seq_ids = None if self._only_src else []
self._sample_infos = []
for i, line in enumerate(self._load_lines(fpattern, tar_fname)):
src_trg_ids = converters(line)
self._src_seq_ids.append(src_trg_ids[0])
lens = [len(src_trg_ids[0])]
if not self._only_src:
self._trg_seq_ids.append(src_trg_ids[1])
lens.append(len(src_trg_ids[1]))
self._sample_infos.append(SampleInfo(i, max(lens), min(lens)))
def _load_lines(self, fpattern, tar_fname):
fpaths = glob.glob(fpattern)
assert len(fpaths) > 0, "no matching file to the provided data path"
if len(fpaths) == 1 and tarfile.is_tarfile(fpaths[0]):
if tar_fname is None:
raise Exception("If tar file provided, please set tar_fname.")
f = tarfile.open(fpaths[0], "r")
for line in f.extractfile(tar_fname):
fields = line.strip(b"\n").split(self._field_delimiter)
if (not self._only_src
and len(fields) == 2) or (self._only_src
and len(fields) == 1):
yield fields
else:
for fpath in fpaths:
if not os.path.isfile(fpath):
raise IOError("Invalid file: %s" % fpath)
with open(fpath, "rb") as f:
for line in f:
fields = line.strip(b"\n").split(self._field_delimiter)
if (not self._only_src and len(fields) == 2) or (
self._only_src and len(fields) == 1):
yield fields
@staticmethod
def load_dict(dict_path, reverse=False):
word_dict = {}
with open(dict_path, "rb") as fdict:
for idx, line in enumerate(fdict):
if reverse:
word_dict[idx] = line.strip(b"\n")
else:
word_dict[line.strip(b"\n")] = idx
return word_dict
def get_vocab_summary(self):
return len(self._src_vocab), len(
self._trg_vocab), self._bos_idx, self._eos_idx, self._unk_idx
def __getitem__(self, idx):
return (self._src_seq_ids[idx], self._trg_seq_ids[idx][:-1],
self._trg_seq_ids[idx][1:]
) if not self._only_src else self._src_seq_ids[idx]
def __len__(self):
return len(self._sample_infos)
class Seq2SeqBatchSampler(BatchSampler):
def __init__(self,
dataset,
batch_size,
pool_size,
sort_type=SortType.GLOBAL,
min_length=0,
max_length=100,
shuffle=True,
shuffle_batch=False,
use_token_batch=False,
clip_last_batch=False,
seed=0):
for arg, value in locals().items():
if arg != "self":
setattr(self, "_" + arg, value)
self._random = np.random
self._random.seed(seed)
# for multi-devices
self._nranks = ParallelEnv().nranks
self._local_rank = ParallelEnv().local_rank
self._device_id = ParallelEnv().dev_id
def __iter__(self):
# global sort or global shuffle
if self._sort_type == SortType.GLOBAL:
infos = sorted(self._dataset._sample_infos,
key=lambda x: x.max_len)
else:
if self._shuffle:
infos = self._dataset._sample_infos
self._random.shuffle(infos)
else:
infos = self._dataset._sample_infos
if self._sort_type == SortType.POOL:
reverse = True
for i in range(0, len(infos), self._pool_size):
# to avoid placing short next to long sentences
reverse = not reverse
infos[i:i + self._pool_size] = sorted(
infos[i:i + self._pool_size],
key=lambda x: x.max_len,
reverse=reverse)
batches = []
batch_creator = TokenBatchCreator(
self._batch_size
) if self._use_token_batch else SentenceBatchCreator(self._batch_size *
self._nranks)
batch_creator = MinMaxFilter(self._max_length, self._min_length,
batch_creator)
for info in infos:
batch = batch_creator.append(info)
if batch is not None:
batches.append(batch)
if not self._clip_last_batch and len(batch_creator.batch) != 0:
batches.append(batch_creator.batch)
if self._shuffle_batch:
self._random.shuffle(batches)
if not self._use_token_batch:
# when producing batches according to sequence number, to ensure
# neighboring batches (which would be fed and run in parallel) have
# similar lengths (thus similar computational cost) after shuffling,
# we take them as a whole when shuffling and split them here
batches = [[
batch[self._batch_size * i:self._batch_size * (i + 1)]
for i in range(self._nranks)
] for batch in batches]
batches = itertools.chain.from_iterable(batches)
# for multi-device
for batch_id, batch in enumerate(batches):
if batch_id % self._nranks == self._local_rank:
batch_indices = [info.i for info in batch]
yield batch_indices
if self._local_rank > len(batches) % self._nranks:
yield batch_indices
def __len__(self):
return 100
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid import ParamAttr
from paddle.fluid.initializer import UniformInitializer
from paddle.fluid.dygraph import Embedding, Linear, Layer
from rnn_api import DynamicDecode, RNN, BasicLSTMCell, RNNCell
from model import Model, Loss
class CrossEntropyCriterion(Loss):
def __init__(self):
super(CrossEntropyCriterion, self).__init__()
def forward(self, outputs, labels):
(predict, mask), label = outputs, labels[0]
cost = layers.softmax_with_cross_entropy(logits=predict,
label=label,
soft_label=False)
masked_cost = layers.elementwise_mul(cost, mask, axis=0)
batch_mean_cost = layers.reduce_mean(masked_cost, dim=[0])
seq_cost = layers.reduce_sum(batch_mean_cost)
return seq_cost
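# Illustrative numpy sketch (hypothetical helper, not part of the original
# file) of the masking scheme above: token-level cross entropy is zeroed at
# padded positions, averaged over the batch dimension and summed over time.
def _demo_masked_loss():
    import numpy as np
    cost = np.array([[0.5, 0.2, 0.3],
                     [0.4, 0.1, 0.6]])  # [batch, time] token-level losses
    mask = np.array([[1., 1., 0.],
                     [1., 1., 1.]])     # 0 marks target padding
    masked = cost * mask
    loss = masked.mean(axis=0).sum()    # mean over batch, sum over time
    # loss == 0.45 + 0.15 + 0.3 == 0.9
    print(loss)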
class EncoderCell(RNNCell):
def __init__(self,
num_layers,
input_size,
hidden_size,
dropout_prob=0.,
init_scale=0.1):
super(EncoderCell, self).__init__()
self.dropout_prob = dropout_prob
# use add_sublayer to add multi-layers
self.lstm_cells = []
for i in range(num_layers):
self.lstm_cells.append(
self.add_sublayer(
"lstm_%d" % i,
BasicLSTMCell(
input_size=input_size if i == 0 else hidden_size,
hidden_size=hidden_size,
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)))))
def forward(self, step_input, states):
new_states = []
for i, lstm_cell in enumerate(self.lstm_cells):
out, new_state = lstm_cell(step_input, states[i])
step_input = layers.dropout(
out, self.dropout_prob) if self.dropout_prob > 0 else out
new_states.append(new_state)
return step_input, new_states
@property
def state_shape(self):
return [cell.state_shape for cell in self.lstm_cells]
class Encoder(Layer):
def __init__(self,
vocab_size,
embed_dim,
hidden_size,
num_layers,
dropout_prob=0.,
init_scale=0.1):
super(Encoder, self).__init__()
self.embedder = Embedding(
size=[vocab_size, embed_dim],
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)))
self.stack_lstm = RNN(EncoderCell(num_layers, embed_dim, hidden_size,
dropout_prob, init_scale),
is_reverse=False,
time_major=False)
def forward(self, sequence, sequence_length):
inputs = self.embedder(sequence)
encoder_output, encoder_state = self.stack_lstm(
inputs, sequence_length=sequence_length)
return encoder_output, encoder_state
class AttentionLayer(Layer):
def __init__(self, hidden_size, bias=False, init_scale=0.1):
super(AttentionLayer, self).__init__()
self.input_proj = Linear(
hidden_size,
hidden_size,
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)),
bias_attr=bias)
self.output_proj = Linear(
hidden_size + hidden_size,
hidden_size,
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)),
bias_attr=bias)
def forward(self, hidden, encoder_output, encoder_padding_mask):
query = self.input_proj(hidden)
attn_scores = layers.matmul(layers.unsqueeze(query, [1]),
encoder_output,
transpose_y=True)
if encoder_padding_mask is not None:
attn_scores = layers.elementwise_add(attn_scores,
encoder_padding_mask)
attn_scores = layers.softmax(attn_scores)
attn_out = layers.squeeze(layers.matmul(attn_scores, encoder_output),
[1])
attn_out = layers.concat([attn_out, hidden], 1)
attn_out = self.output_proj(attn_out)
return attn_out
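# Illustrative numpy sketch (hypothetical helper, not part of the original
# file) of the score computation above: the projected decoder hidden state is
# dotted with every encoder output, a large negative bias masks padded source
# positions, and the softmax weights mix the encoder outputs into a context.
def _demo_attention_scores():
    import numpy as np
    query = np.array([1.0, 0.0])              # projected hidden state, [hidden]
    enc_out = np.array([[1.0, 0.0],
                        [0.5, 0.5],
                        [0.0, 0.0]])           # [src_len, hidden], last is pad
    pad_bias = np.array([0.0, 0.0, -1e9])      # additive padding mask
    scores = enc_out @ query + pad_bias
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()                   # softmax; ~0 weight on padding
    context = weights @ enc_out                # attended encoder summary
    print(weights, context)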
class DecoderCell(RNNCell):
def __init__(self,
num_layers,
input_size,
hidden_size,
dropout_prob=0.,
init_scale=0.1):
super(DecoderCell, self).__init__()
self.dropout_prob = dropout_prob
# use add_sublayer to add multi-layers
self.lstm_cells = []
for i in range(num_layers):
self.lstm_cells.append(
self.add_sublayer(
"lstm_%d" % i,
BasicLSTMCell(input_size=input_size +
hidden_size if i == 0 else hidden_size,
hidden_size=hidden_size)))
self.attention_layer = AttentionLayer(hidden_size)
def forward(self,
step_input,
states,
encoder_output,
encoder_padding_mask=None):
lstm_states, input_feed = states
new_lstm_states = []
step_input = layers.concat([step_input, input_feed], 1)
for i, lstm_cell in enumerate(self.lstm_cells):
out, new_lstm_state = lstm_cell(step_input, lstm_states[i])
step_input = layers.dropout(
out, self.dropout_prob) if self.dropout_prob > 0 else out
new_lstm_states.append(new_lstm_state)
out = self.attention_layer(step_input, encoder_output,
encoder_padding_mask)
return out, [new_lstm_states, out]
class Decoder(Layer):
def __init__(self,
vocab_size,
embed_dim,
hidden_size,
num_layers,
dropout_prob=0.,
init_scale=0.1):
super(Decoder, self).__init__()
self.embedder = Embedding(
size=[vocab_size, embed_dim],
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)))
self.lstm_attention = RNN(DecoderCell(num_layers, embed_dim, hidden_size,
dropout_prob, init_scale),
is_reverse=False,
time_major=False)
self.output_layer = Linear(
hidden_size,
vocab_size,
param_attr=ParamAttr(initializer=UniformInitializer(
low=-init_scale, high=init_scale)),
bias_attr=False)
def forward(self, target, decoder_initial_states, encoder_output,
encoder_padding_mask):
inputs = self.embedder(target)
decoder_output, _ = self.lstm_attention(
inputs,
decoder_initial_states,
encoder_output=encoder_output,
encoder_padding_mask=encoder_padding_mask)
predict = self.output_layer(decoder_output)
return predict
class Seq2Seq(Model):
def __init__(self,
src_vocab_size,
trg_vocab_size,
embed_dim,
hidden_size,
num_layers,
dropout_prob=0.,
init_scale=0.1):
super(Seq2Seq, self).__init__()
self.hidden_size = hidden_size
self.encoder = Encoder(src_vocab_size, embed_dim, hidden_size,
num_layers, dropout_prob, init_scale)
self.decoder = Decoder(trg_vocab_size, embed_dim, hidden_size,
num_layers, dropout_prob, init_scale)
def forward(self, src, src_length, trg, trg_length):
# encoder
encoder_output, encoder_final_state = self.encoder(src, src_length)
# decoder initial states: use input_feed and the structure is
# [[h,c] * num_layers, input_feed]
decoder_initial_states = [
encoder_final_state,
self.decoder.lstm_attention.cell.get_initial_states(
batch_ref=encoder_output, shape=[self.hidden_size])
]
# attention mask to avoid attending to paddings
src_mask = layers.sequence_mask(src_length,
maxlen=layers.shape(src)[1],
dtype=encoder_output.dtype)
encoder_padding_mask = (src_mask - 1.0) * 1e9
encoder_padding_mask = layers.unsqueeze(encoder_padding_mask, [1])
# decoder with attention
predict = self.decoder(trg, decoder_initial_states, encoder_output,
encoder_padding_mask)
# for target padding mask
mask = layers.sequence_mask(trg_length,
maxlen=layers.shape(trg)[1],
dtype=predict.dtype)
return predict, mask
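# Illustrative numpy sketch (hypothetical helper, not part of the original
# file) of how the attention padding mask above is built from source lengths:
# valid positions get 0 and padded positions get a large negative value that
# vanishes after softmax.
def _demo_encoder_padding_mask():
    import numpy as np
    src_length = np.array([3, 2])
    max_len = 4
    src_mask = (np.arange(max_len)[None, :] < src_length[:, None]).astype("float32")
    encoder_padding_mask = (src_mask - 1.0) * 1e9
    # -> [[ 0.,  0.,  0., -1e9],
    #     [ 0.,  0., -1e9, -1e9]]
    print(encoder_padding_mask)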
class Seq2SeqInferModel(Seq2Seq):
def __init__(self,
vocab_size,
embed_dim,
hidden_size,
num_layers,
dropout_prob=0.):
pass
# used for continuous evaluation
enable_ce: False
eager_run: False
# The frequency to save trained models when training.
save_step: 10000
# The frequency to fetch and print output when training.
print_step: 100
# path of the checkpoint, to resume the previous training
init_from_checkpoint: ""
# path of the pretrain model, to better solve the current task
init_from_pretrain_model: ""
# path of trained parameter, to make prediction
init_from_params: "trained_params/step_100000/"
# the directory for saving model
save_model: "trained_models"
# the directory for saving inference model.
inference_model_dir: "infer_model"
# Set seed for CE or debug
random_seed: None
# The pattern to match training data files.
training_file: "wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de"
# The pattern to match validation data files.
validation_file: "wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de"
# The pattern to match test data files.
predict_file: "wmt16_ende_data_bpe/newstest2016.tok.bpe.32000.en-de"
# The file to output the translation results of predict_file to.
output_file: "predict.txt"
# The path of vocabulary file of source language.
src_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000"
# The path of vocabulary file of target language.
trg_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000"
# The <bos>, <eos> and <unk> tokens in the dictionary.
special_token: ["<s>", "<e>", "<unk>"]
# max length of sequences
max_length: 256
# whether to use cuda
use_cuda: True
# args for reader, see reader.py for details
token_delimiter: " "
use_token_batch: True
pool_size: 200000
sort_type: "pool"
shuffle: True
shuffle_batch: True
batch_size: 4096
# Hyperparams for training:
# the number of epochs for training
epoch: 30
# the hyper parameters for Adam optimizer.
# This static learning_rate will be multiplied by the LearningRateScheduler-
# derived learning rate to get the final learning rate.
learning_rate: 0.001
# Hyperparams for generation:
# the parameters for beam search.
beam_size: 5
max_out_len: 256
# the number of decoded sentences to output.
n_best: 1
# Hyperparams for model:
# The following five vocabulary-related configurations will be set
# automatically according to the passed vocabulary path and special tokens.
# size of source word dictionary.
src_vocab_size: 10000
# size of target word dictionary
trg_vocab_size: 10000
# index for <bos> token
bos_idx: 0
# index for <eos> token
eos_idx: 1
# index for <unk> token
unk_idx: 2
embed_dim: 512
hidden_size: 512
num_layers: 2
dropout: 0.1
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import six
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import time
import contextlib
from functools import partial
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable
from paddle.fluid.io import DataLoader
from configure import PDConfig
from reader import prepare_train_input, Seq2SeqDataset, Seq2SeqBatchSampler
from seq2seq import Seq2Seq, CrossEntropyCriterion
from model import Input, set_device
from callbacks import ProgBarLogger
class LoggerCallback(ProgBarLogger):
def __init__(self, log_freq=1, verbose=2, loss_normalizer=0.):
super(LoggerCallback, self).__init__(log_freq, verbose)
# TODO: wrap these override function to simplify
self.loss_normalizer = loss_normalizer
def on_train_begin(self, logs=None):
super(LoggerCallback, self).on_train_begin(logs)
self.train_metrics += ["normalized loss", "ppl"]
def on_train_batch_end(self, step, logs=None):
logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer
logs["ppl"] = np.exp(min(logs["loss"][0], 100))
super(LoggerCallback, self).on_train_batch_end(step, logs)
def on_eval_begin(self, logs=None):
super(LoggerCallback, self).on_eval_begin(logs)
self.eval_metrics += ["normalized loss", "ppl"]
def on_eval_batch_end(self, step, logs=None):
logs["normalized loss"] = logs["loss"][0] - self.loss_normalizer
logs["ppl"] = np.exp(min(logs["loss"][0], 100))
super(LoggerCallback, self).on_eval_batch_end(step, logs)
def do_train(args):
device = set_device("gpu" if args.use_cuda else "cpu")
fluid.enable_dygraph(device) if args.eager_run else None
# set seed for CE
random_seed = eval(str(args.random_seed))
if random_seed is not None:
fluid.default_main_program().random_seed = random_seed
fluid.default_startup_program().random_seed = random_seed
# define model
inputs = [
Input([None, None], "int64", name="src_word"),
Input([None], "int64", name="src_length"),
Input([None, None], "int64", name="trg_word"),
Input([None], "int64", name="trg_length"),
]
labels = [
Input([None, None, 1], "int64", name="label"),
]
model = Seq2Seq(args.src_vocab_size, args.trg_vocab_size, args.embed_dim,
args.hidden_size, args.num_layers, args.dropout)
model.prepare(fluid.optimizer.Adam(learning_rate=args.learning_rate,
parameter_list=model.parameters()),
CrossEntropyCriterion(),
inputs=inputs,
labels=labels)
batch_size = 32
src_seq_len = 10
trg_seq_len = 12
iter_num = 10
def random_generator():
for i in range(iter_num):
src = np.random.randint(2, args.src_vocab_size,
(batch_size, src_seq_len)).astype("int64")
src_length = np.random.randint(
1, src_seq_len, (batch_size, )).astype("int64")
trg = np.random.randint(2, args.trg_vocab_size,
(batch_size, trg_seq_len)).astype("int64")
trg_length = np.random.randint(1, trg_seq_len,
(batch_size, )).astype("int64")
label = np.random.randint(1, trg_seq_len,
(batch_size, trg_seq_len, 1)).astype("int64")
yield src, src_length, trg, trg_length, label
model.fit(train_data=random_generator, log_freq=1)
exit(0)
dataset = Seq2SeqDataset(fpattern=args.training_file,
src_vocab_fpath=args.src_vocab_fpath,
trg_vocab_fpath=args.trg_vocab_fpath,
token_delimiter=args.token_delimiter,
start_mark=args.special_token[0],
end_mark=args.special_token[1],
unk_mark=args.special_token[2])
args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \
args.unk_idx = dataset.get_vocab_summary()
batch_sampler = Seq2SeqBatchSampler(dataset=dataset,
use_token_batch=args.use_token_batch,
batch_size=args.batch_size,
pool_size=args.pool_size,
sort_type=args.sort_type,
shuffle=args.shuffle,
shuffle_batch=args.shuffle_batch,
max_length=args.max_length)
train_loader = DataLoader(dataset=dataset,
batch_sampler=batch_sampler,
places=device,
feed_list=[x.forward() for x in inputs + labels],
collate_fn=partial(prepare_train_input,
src_pad_idx=args.eos_idx,
trg_pad_idx=args.eos_idx),
num_workers=0,
return_list=True)
model.fit(train_data=train_loader,
eval_data=None,
epochs=1,
eval_freq=1,
save_freq=1,
verbose=2,
callbacks=[
LoggerCallback(log_freq=args.print_step)
])
if __name__ == "__main__":
args = PDConfig(yaml_file="./seq2seq.yaml")
args.build()
args.Print()
do_train(args)
@@ -8,11 +8,19 @@
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers.utils as utils
from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
from paddle.fluid.dygraph import to_variable, Embedding, Linear, LayerNorm
from paddle.fluid.data_feeder import convert_dtype
from paddle.fluid import layers
from paddle.fluid.dygraph import Layer
from paddle.layers import BeamSearchDecoder

__all__ = [
    'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode',
    'BeamSearchDecoder', 'MultiHeadAttention', 'FFN',
    'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer',
    'TransformerDecoder', 'TransformerBeamSearchDecoder'
]

class RNNCell(Layer):
@@ -307,11 +315,13 @@ class BasicGRUCell(RNNCell):
gate_bias_attr = self._bias_attr
candidate_bias_attr = self._bias_attr

self._gate_bias = self.create_parameter(
    attr=gate_bias_attr,
    shape=[2 * self._hiden_size],
    dtype=self._dtype,
    is_bias=True)
self._candidate_bias = self.create_parameter(
    attr=candidate_bias_attr,
    shape=[self._hiden_size],
    dtype=self._dtype,
    is_bias=True)
@@ -329,8 +339,8 @@ class BasicGRUCell(RNNCell):
r_hidden = r * pre_hidden

candidate = layers.matmul(
    layers.concat([input, r_hidden], 1), self._candidate_weight)
candidate = layers.elementwise_add(candidate, self._candidate_bias)
c = self._activation(candidate)
@@ -643,3 +653,340 @@ class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):
beam_search_state.finished)
return (beam_search_output, beam_search_state, next_inputs, finished)
### Transformer Modules ###
class PrePostProcessLayer(Layer):
"""
PrePostProcessLayer
"""
def __init__(self, process_cmd, d_model, dropout_rate):
super(PrePostProcessLayer, self).__init__()
self.process_cmd = process_cmd
self.functors = []
for cmd in self.process_cmd:
if cmd == "a": # add residual connection
self.functors.append(lambda x, y: x + y if y else x)
elif cmd == "n": # add layer normalization
self.functors.append(
self.add_sublayer(
"layer_norm_%d" % len(
self.sublayers(include_sublayers=False)),
LayerNorm(
normalized_shape=d_model,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(0.)))))
elif cmd == "d": # add dropout
self.functors.append(lambda x: layers.dropout(
x, dropout_prob=dropout_rate, is_test=False)
if dropout_rate else x)
def forward(self, x, residual=None):
for i, cmd in enumerate(self.process_cmd):
if cmd == "a":
x = self.functors[i](x, residual)
else:
x = self.functors[i](x)
return x
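# Illustrative numpy sketch (hypothetical helper, not part of the original
# file) of the "n" / "da" command strings used below: "n" applies layer
# normalization before a sublayer and "da" applies dropout (identity here)
# plus the residual add after it, i.e. the pre-norm Transformer arrangement.
def _demo_pre_post_process():
    import numpy as np
    x = np.array([[1.0, 3.0, 5.0]])                       # sublayer input
    normed = (x - x.mean(-1, keepdims=True)) / x.std(-1, keepdims=True)  # "n"
    sublayer_out = normed * 2.0                           # stand-in for attention/FFN
    out = x + sublayer_out                                # "da", dropout disabled
    print(out)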
class MultiHeadAttention(Layer):
"""
Multi-Head Attention
"""
def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.):
super(MultiHeadAttention, self).__init__()
self.n_head = n_head
self.d_key = d_key
self.d_value = d_value
self.d_model = d_model
self.dropout_rate = dropout_rate
self.q_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
self.k_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
self.v_fc = Linear(
input_dim=d_model, output_dim=d_value * n_head, bias_attr=False)
self.proj_fc = Linear(
input_dim=d_value * n_head, output_dim=d_model, bias_attr=False)
def _prepare_qkv(self, queries, keys, values, cache=None):
if keys is None: # self-attention
keys, values = queries, queries
static_kv = False
else: # cross-attention
static_kv = True
q = self.q_fc(queries)
q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key])
q = layers.transpose(x=q, perm=[0, 2, 1, 3])
if cache is not None and static_kv and "static_k" in cache:
# for encoder-decoder attention in inference, static k/v already cached
k = cache["static_k"]
v = cache["static_v"]
else:
k = self.k_fc(keys)
v = self.v_fc(values)
k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
k = layers.transpose(x=k, perm=[0, 2, 1, 3])
v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
v = layers.transpose(x=v, perm=[0, 2, 1, 3])
if cache is not None:
if static_kv and "static_k" not in cache:
# for encoder-decoder attention in inference, static k/v not cached yet
cache["static_k"], cache["static_v"] = k, v
elif not static_kv:
# for decoder self-attention in inference
cache_k, cache_v = cache["k"], cache["v"]
k = layers.concat([cache_k, k], axis=2)
v = layers.concat([cache_v, v], axis=2)
cache["k"], cache["v"] = k, v
return q, k, v
def forward(self, queries, keys, values, attn_bias, cache=None):
# compute q, k, v
q, k, v = self._prepare_qkv(queries, keys, values, cache)
# scaled dot-product attention
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5)
if attn_bias:
product += attn_bias
weights = layers.softmax(product)
if self.dropout_rate:
weights = layers.dropout(
weights, dropout_prob=self.dropout_rate, is_test=False)
out = layers.matmul(weights, v)
# combine heads
out = layers.transpose(out, perm=[0, 2, 1, 3])
out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
# project to output
out = self.proj_fc(out)
return out
def cal_kv(self, keys, values):
k = self.k_fc(keys)
v = self.v_fc(values)
k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
k = layers.transpose(x=k, perm=[0, 2, 1, 3])
v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
v = layers.transpose(x=v, perm=[0, 2, 1, 3])
return k, v
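# Illustrative sketch (hypothetical helper, not part of the original file) of
# how the `cache` dict above behaves during incremental decoding: the decoder
# self-attention keys/values grow one step at a time under "k"/"v", while the
# cross-attention keys/values are computed once from the encoder output and
# reused under "static_k"/"static_v".
def _demo_attention_cache():
    import numpy as np
    # per-layer self-attention cache, layout [batch, n_head, time, d_key]
    cache = {"k": np.zeros((1, 2, 0, 4)), "v": np.zeros((1, 2, 0, 4))}
    for step in range(3):
        new_k = np.random.rand(1, 2, 1, 4)   # k/v of the token decoded this step
        new_v = np.random.rand(1, 2, 1, 4)
        cache["k"] = np.concatenate([cache["k"], new_k], axis=2)
        cache["v"] = np.concatenate([cache["v"], new_v], axis=2)
    print(cache["k"].shape)                   # (1, 2, 3, 4)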
class FFN(Layer):
"""
Feed-Forward Network
"""
def __init__(self, d_inner_hid, d_model, dropout_rate):
super(FFN, self).__init__()
self.dropout_rate = dropout_rate
self.fc1 = Linear(
input_dim=d_model, output_dim=d_inner_hid, act="relu")
self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model)
def forward(self, x):
hidden = self.fc1(x)
if self.dropout_rate:
hidden = layers.dropout(
hidden, dropout_prob=self.dropout_rate, is_test=False)
out = self.fc2(hidden)
return out
class TransformerEncoderLayer(Layer):
"""
EncoderLayer
"""
def __init__(self,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(TransformerEncoderLayer, self).__init__()
self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
attention_dropout)
self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout)
self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
self.ffn = FFN(d_inner_hid, d_model, relu_dropout)
self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout)
def forward(self, enc_input, attn_bias):
attn_output = self.self_attn(
self.preprocesser1(enc_input), None, None, attn_bias)
attn_output = self.postprocesser1(attn_output, enc_input)
ffn_output = self.ffn(self.preprocesser2(attn_output))
ffn_output = self.postprocesser2(ffn_output, attn_output)
return ffn_output
class TransformerEncoder(Layer):
"""
encoder
"""
def __init__(self,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(TransformerEncoder, self).__init__()
self.encoder_layers = list()
for i in range(n_layer):
self.encoder_layers.append(
self.add_sublayer(
"layer_%d" % i,
TransformerEncoderLayer(
n_head, d_key, d_value, d_model, d_inner_hid,
prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)))
self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
def forward(self, enc_input, attn_bias):
for encoder_layer in self.encoder_layers:
enc_output = encoder_layer(enc_input, attn_bias)
enc_input = enc_output
return self.processer(enc_output)
class TransformerDecoderLayer(Layer):
"""
DecoderLayer
"""
def __init__(self,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da"):
super(TransformerDecoderLayer, self).__init__()
self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
attention_dropout)
self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout)
self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
self.cross_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
attention_dropout)
self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout)
self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
self.ffn = FFN(d_inner_hid, d_model, relu_dropout)
self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout)
def forward(self,
dec_input,
enc_output,
self_attn_bias,
cross_attn_bias,
cache=None):
self_attn_output = self.self_attn(
self.preprocesser1(dec_input), None, None, self_attn_bias, cache)
self_attn_output = self.postprocesser1(self_attn_output, dec_input)
cross_attn_output = self.cross_attn(
self.preprocesser2(self_attn_output), enc_output, enc_output,
cross_attn_bias, cache)
cross_attn_output = self.postprocesser2(cross_attn_output,
self_attn_output)
ffn_output = self.ffn(self.preprocesser3(cross_attn_output))
ffn_output = self.postprocesser3(ffn_output, cross_attn_output)
return ffn_output
class TransformerDecoder(Layer):
"""
decoder
"""
def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
prepostprocess_dropout, attention_dropout, relu_dropout,
preprocess_cmd, postprocess_cmd):
super(TransformerDecoder, self).__init__()
self.decoder_layers = list()
for i in range(n_layer):
self.decoder_layers.append(
self.add_sublayer(
"layer_%d" % i,
TransformerDecoderLayer(
n_head, d_key, d_value, d_model, d_inner_hid,
prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)))
self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
def forward(self,
dec_input,
enc_output,
self_attn_bias,
cross_attn_bias,
caches=None):
for i, decoder_layer in enumerate(self.decoder_layers):
dec_output = decoder_layer(dec_input, enc_output, self_attn_bias,
cross_attn_bias, None
if caches is None else caches[i])
dec_input = dec_output
return self.processer(dec_output)
def prepare_static_cache(self, enc_output):
return [
dict(
zip(("static_k", "static_v"),
decoder_layer.cross_attn.cal_kv(enc_output, enc_output)))
for decoder_layer in self.decoder_layers
]
@@ -55,18 +55,23 @@ def do_predict(args):
fluid.enable_dygraph(device) if args.eager_run else None

inputs = [
    Input(
        [None, None], "int64", name="src_word"),
    Input(
        [None, None], "int64", name="src_pos"),
    Input(
        [None, args.n_head, None, None],
        "float32",
        name="src_slf_attn_bias"),
    Input(
        [None, args.n_head, None, None],
        "float32",
        name="trg_src_attn_bias"),
]

# define data
dataset = Seq2SeqDataset(
    fpattern=args.predict_file,
    src_vocab_fpath=args.src_vocab_fpath,
    trg_vocab_fpath=args.trg_vocab_fpath,
    token_delimiter=args.token_delimiter,
@@ -75,24 +80,27 @@ def do_predict(args):
    unk_mark=args.special_token[2])
args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \
    args.unk_idx = dataset.get_vocab_summary()
trg_idx2word = Seq2SeqDataset.load_dict(
    dict_path=args.trg_vocab_fpath, reverse=True)
batch_sampler = Seq2SeqBatchSampler(
    dataset=dataset,
    use_token_batch=False,
    batch_size=args.batch_size,
    max_length=args.max_length)
data_loader = DataLoader(
    dataset=dataset,
    batch_sampler=batch_sampler,
    places=device,
    feed_list=None
    if fluid.in_dygraph_mode() else [x.forward() for x in inputs],
    collate_fn=partial(
        prepare_infer_input, src_pad_idx=args.eos_idx, n_head=args.n_head),
    num_workers=0,
    return_list=True)
# define model
transformer = InferTransformer(
    args.src_vocab_size,
    args.trg_vocab_size,
    args.max_length + 1,
    args.n_layer,
@@ -126,8 +134,7 @@ def do_predict(args):
for ins in finished_seq:
    for beam_idx, beam in enumerate(ins):
        if beam_idx >= args.n_best: break
        id_list = post_process_seq(beam, args.bos_idx, args.eos_idx)
        word_list = [trg_idx2word[id] for id in id_list]
        sequence = b" ".join(word_list) + b"\n"
        f.write(sequence)
...
import collections
import contextlib
import inspect
import six
import sys
from functools import partial, reduce
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers.utils as utils
from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
from paddle.fluid.dygraph import to_variable, Embedding, Linear
from paddle.fluid.data_feeder import convert_dtype
from paddle.fluid import layers
from paddle.fluid.dygraph import Layer
class RNNUnit(Layer):
def get_initial_states(self,
batch_ref,
shape=None,
dtype=None,
init_value=0,
batch_dim_idx=0):
"""
Generate initialized states according to provided shape, data type and
value.
Parameters:
batch_ref: A (possibly nested structure of) tensor variable[s].
The first dimension of the tensor will be used as batch size to
initialize states.
shape: A (possibly nested structure of) shape[s], where a shape is
represented as a list/tuple of integers. -1 (for batch size) will
be automatically inserted if the shape does not start with it. If None,
property `state_shape` will be used. The default value is None.
dtype: A (possibly nested structure of) data type[s]. The structure
must be the same as that of `shape`, except when all tensors in states
have the same data type, in which case a single data type can be used. If None and
property `cell.state_shape` is not available, float32 will be used
as the data type. The default value is None.
init_value: A float value used to initialize states.
Returns:
Variable: tensor variable[s] packed in the same structure provided \
by shape, representing the initialized states.
"""
# TODO: use inputs and batch_size
batch_ref = flatten(batch_ref)[0]
def _is_shape_sequence(seq):
if sys.version_info < (3, ):
integer_types = (
int,
long, )
else:
integer_types = (int, )
"""For shape, list/tuple of integer is the finest-grained objection"""
if (isinstance(seq, list) or isinstance(seq, tuple)):
if reduce(
lambda flag, x: isinstance(x, integer_types) and flag,
seq, True):
return False
# TODO: Add check for the illegal
if isinstance(seq, dict):
return True
return (isinstance(seq, collections.Sequence) and
not isinstance(seq, six.string_types))
class Shape(object):
def __init__(self, shape):
self.shape = shape if shape[0] == -1 else ([-1] + list(shape))
# nested structure of shapes
states_shapes = self.state_shape if shape is None else shape
is_sequence_ori = utils.is_sequence
utils.is_sequence = _is_shape_sequence
states_shapes = map_structure(lambda shape: Shape(shape),
states_shapes)
utils.is_sequence = is_sequence_ori
# nested structure of dtypes
try:
states_dtypes = self.state_dtype if dtype is None else dtype
except NotImplementedError: # use fp32 as default
states_dtypes = "float32"
if len(flatten(states_dtypes)) == 1:
dtype = flatten(states_dtypes)[0]
states_dtypes = map_structure(lambda shape: dtype, states_shapes)
init_states = map_structure(
lambda shape, dtype: fluid.layers.fill_constant_batch_size_like(
input=batch_ref,
shape=shape.shape,
dtype=dtype,
value=init_value,
input_dim_idx=batch_dim_idx), states_shapes, states_dtypes)
return init_states
@property
def state_shape(self):
"""
Abstract method (property).
Used to initialize states.
A (possibly nested structure of) shape[s], where a shape is represented
as a list/tuple of integers (-1 for batch size would be automatically
inserted into a shape if the shape does not start with it).
Not necessary to be implemented if states are not initialized by
`get_initial_states` or the `shape` argument is provided when using
`get_initial_states`.
"""
raise NotImplementedError(
"Please add implementaion for `state_shape` in the used cell.")
@property
def state_dtype(self):
"""
Abstract method (property).
Used to initialize states.
A (possibly nested structure of) data type[s]. The structure must be the
same as that of `shape`, except when all tensors in states have the same
data type, in which case a single data type can be used.
Not necessary to be implemented if states are not initialized
by `get_initial_states` or the `dtype` argument is provided when using
`get_initial_states`.
"""
raise NotImplementedError(
"Please add implementaion for `state_dtype` in the used cell.")
class BasicLSTMUnit(RNNUnit):
"""
****
BasicLSTMUnit class, Using basic operator to build LSTM
The algorithm can be described as the code below.
.. math::
i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i)
f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias )
o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o)
\\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
h_t &= o_t \odot tanh(c_t)
- $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix
of weights from the input gate to the input)
- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
- sigmoid is the logistic sigmoid function.
- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
and cell activation vectors, respectively, all of which have the same size as
the cell output activation vector $h$.
- The :math:`\odot` is the element-wise product of the vectors.
- :math:`tanh` is the activation function.
- :math:`\\tilde{c_t}` is also called candidate hidden state,
which is computed based on the current input and the previous hidden state.
Args:
hidden_size (integer): The hidden size used in the Unit.
input_size (integer): The input size used in the Unit.
param_attr(ParamAttr|None): The parameter attribute for the learnable
weight matrix. Note:
If it is set to None or one attribute of ParamAttr, lstm_unit will
create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|None): The parameter attribute for the bias
of LSTM unit.
If it is set to None or one attribute of ParamAttr, lstm_unit will
create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized as zero. Default: None.
gate_activation (function|None): The activation function for gates (actGate).
Default: 'fluid.layers.sigmoid'
activation (function|None): The activation function for cells (actNode).
Default: 'fluid.layers.tanh'
forget_bias (float): forget bias added when computing the forget gate.
Default: 1.0
dtype (string): data type used in this unit. Default: 'float32'
"""
def __init__(self,
hidden_size,
input_size,
param_attr=None,
bias_attr=None,
gate_activation=None,
activation=None,
forget_bias=1.0,
dtype='float32'):
super(BasicLSTMUnit, self).__init__(dtype)
self._hidden_size = hidden_size
self._param_attr = param_attr
self._bias_attr = bias_attr
self._gate_activation = gate_activation or layers.sigmoid
self._activation = activation or layers.tanh
self._forget_bias = layers.fill_constant(
[1], dtype=dtype, value=forget_bias)
self._forget_bias.stop_gradient = False
self._dtype = dtype
self._input_size = input_size
self._weight = self.create_parameter(
attr=self._param_attr,
shape=[
self._input_size + self._hidden_size, 4 * self._hidden_size
],
dtype=self._dtype)
self._bias = self.create_parameter(
attr=self._bias_attr,
shape=[4 * self._hidden_size],
dtype=self._dtype,
is_bias=True)
def forward(self, input, state):
pre_hidden, pre_cell = state
concat_input_hidden = layers.concat([input, pre_hidden], 1)
gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
gate_input = layers.elementwise_add(gate_input, self._bias)
i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
new_cell = layers.elementwise_add(
layers.elementwise_mul(
pre_cell,
layers.sigmoid(layers.elementwise_add(f, self._forget_bias))),
layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j)))
new_hidden = layers.tanh(new_cell) * layers.sigmoid(o)
return new_hidden, [new_hidden, new_cell]
@property
def state_shape(self):
return [[self._hidden_size], [self._hidden_size]]
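# Usage sketch (illustrative only, not part of the original file; assumes
# Paddle 1.x dygraph mode and numpy):
#
#     import numpy as np
#     with fluid.dygraph.guard():
#         cell = BasicLSTMUnit(hidden_size=32, input_size=16)
#         step_input = fluid.dygraph.to_variable(
#             np.random.rand(4, 16).astype("float32"))
#         init_hidden = fluid.dygraph.to_variable(
#             np.zeros((4, 32), dtype="float32"))
#         init_cell = fluid.dygraph.to_variable(
#             np.zeros((4, 32), dtype="float32"))
#         out, (new_hidden, new_cell) = cell(step_input,
#                                            [init_hidden, init_cell])
#         # out is the new hidden state; both have shape [4, 32]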
class RNN(fluid.dygraph.Layer):
def __init__(self, cell, is_reverse=False, time_major=False):
super(RNN, self).__init__()
self.cell = cell
if not hasattr(self.cell, "call"):
self.cell.call = self.cell.forward
self.is_reverse = is_reverse
self.time_major = time_major
self.batch_index, self.time_step_index = (1, 0) if time_major else (0,
1)
def forward(self,
inputs,
initial_states=None,
sequence_length=None,
**kwargs):
if fluid.in_dygraph_mode():
class ArrayWrapper(object):
def __init__(self, x):
self.array = [x]
def append(self, x):
self.array.append(x)
return self
def _maybe_copy(state, new_state, step_mask):
# TODO: use where_op
new_state = fluid.layers.elementwise_mul(
new_state, step_mask,
axis=0) - fluid.layers.elementwise_mul(
state, (step_mask - 1), axis=0)
return new_state
flat_inputs = flatten(inputs)
batch_size, time_steps = (
flat_inputs[0].shape[self.batch_index],
flat_inputs[0].shape[self.time_step_index])
if initial_states is None:
initial_states = self.cell.get_initial_states(
batch_ref=inputs, batch_dim_idx=self.batch_index)
if not self.time_major:
inputs = map_structure(
lambda x: fluid.layers.transpose(x, [1, 0] + list(
range(2, len(x.shape)))), inputs)
if sequence_length is not None:
mask = fluid.layers.sequence_mask(
sequence_length,
maxlen=time_steps,
dtype=flatten(initial_states)[0].dtype)
mask = fluid.layers.transpose(mask, [1, 0])
if self.is_reverse:
inputs = map_structure(
lambda x: fluid.layers.reverse(x, axis=[0]), inputs)
mask = fluid.layers.reverse(
mask, axis=[0]) if sequence_length is not None else None
states = initial_states
outputs = []
for i in range(time_steps):
step_inputs = map_structure(lambda x: x[i], inputs)
step_outputs, new_states = self.cell(step_inputs, states,
**kwargs)
if sequence_length is not None:
new_states = map_structure(
partial(
_maybe_copy, step_mask=mask[i]),
states,
new_states)
states = new_states
outputs = map_structure(
lambda x: ArrayWrapper(x),
step_outputs) if i == 0 else map_structure(
lambda x, x_array: x_array.append(x), step_outputs,
outputs)
final_outputs = map_structure(
lambda x: fluid.layers.stack(x.array,
axis=self.time_step_index),
outputs)
if self.is_reverse:
final_outputs = map_structure(
lambda x: fluid.layers.reverse(x,
axis=self.time_step_index),
final_outputs)
final_states = new_states
else:
final_outputs, final_states = fluid.layers.rnn(
self.cell,
inputs,
initial_states=initial_states,
sequence_length=sequence_length,
time_major=self.time_major,
is_reverse=self.is_reverse,
**kwargs)
return final_outputs, final_states
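# Usage sketch (illustrative only, not part of the original file; assumes
# Paddle 1.x dygraph mode and numpy):
#
#     import numpy as np
#     with fluid.dygraph.guard():
#         lstm = RNN(BasicLSTMUnit(hidden_size=32, input_size=16))
#         # [batch_size, seq_len, input_size] because time_major=False
#         inputs = fluid.dygraph.to_variable(
#             np.random.rand(4, 10, 16).astype("float32"))
#         # initial states default to zeros built by cell.get_initial_states
#         outputs, final_states = lstm(inputs)
#         # outputs: [4, 10, 32]; final_states: [last_hidden, last_cell]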
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable
place = fluid.CPUPlace()
executor = fluid.Executor(place)
class EncoderCell(RNNUnit):
def __init__(self, num_layers, input_size, hidden_size, dropout_prob=0.):
super(EncoderCell, self).__init__()
self.num_layers = num_layers
self.dropout_prob = dropout_prob
self.lstm_cells = list()
for i in range(self.num_layers):
self.lstm_cells.append(
self.add_sublayer("layer_%d" % i,
BasicLSTMUnit(input_size if i == 0 else
hidden_size, hidden_size)))
def forward(self, step_input, states):
new_states = []
for i in range(self.num_layers):
out, new_state = self.lstm_cells[i](step_input, states[i])
step_input = layers.dropout(
out, self.dropout_prob) if self.dropout_prob > 0 else out
new_states.append(new_state)
return step_input, new_states
@property
def state_shape(self):
return [cell.state_shape for cell in self.lstm_cells]
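# Usage sketch (illustrative only, not part of the original file): a 2-layer
# LSTM encoder built from the classes above, assuming dygraph mode and an
# assumed embedding tensor `src_emb` of shape [batch, seq_len, 16]:
#
#     encoder = RNN(EncoderCell(
#         num_layers=2, input_size=16, hidden_size=32, dropout_prob=0.1))
#     enc_output, enc_states = encoder(src_emb)
#     # enc_output: [batch, seq_len, 32]
#     # enc_states: one [hidden, cell] pair per layer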
class MultiHeadAttention(Layer):
"""
Multi-Head Attention
"""
# def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None):
# pass
# def forward(self, queries, keys, values, attn_bias, cache=None):
# pass
def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.):
super(MultiHeadAttention, self).__init__()
self.n_head = n_head
self.d_key = d_key
self.d_value = d_value
self.d_model = d_model
self.dropout_rate = dropout_rate
self.q_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
self.k_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
self.v_fc = Linear(
input_dim=d_model, output_dim=d_value * n_head, bias_attr=False)
self.proj_fc = Linear(
input_dim=d_value * n_head, output_dim=d_model, bias_attr=False)
def forward(self, queries, keys, values, attn_bias, cache=None):
# compute q, k, v
keys = queries if keys is None else keys
values = keys if values is None else values
q = self.q_fc(queries)
k = self.k_fc(keys)
v = self.v_fc(values)
# split head
q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key])
q = layers.transpose(x=q, perm=[0, 2, 1, 3])
k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
k = layers.transpose(x=k, perm=[0, 2, 1, 3])
v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
v = layers.transpose(x=v, perm=[0, 2, 1, 3])
if cache is not None:
cache_k, cache_v = cache["k"], cache["v"]
k = layers.concat([cache_k, k], axis=2)
v = layers.concat([cache_v, v], axis=2)
cache["k"], cache["v"] = k, v
# scale dot product attention
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5)
if attn_bias is not None:
product += attn_bias
weights = layers.softmax(product)
if self.dropout_rate:
weights = layers.dropout(
weights, dropout_prob=self.dropout_rate, is_test=False)
out = layers.matmul(weights, v)
# combine heads
out = layers.transpose(out, perm=[0, 2, 1, 3])
out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
# project to output
out = self.proj_fc(out)
return out
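# Usage sketch (illustrative only, not part of the original file): self
# attention over an assumed tensor `q_input` of shape [batch, seq_len, 512],
# assuming dygraph mode:
#
#     attn = MultiHeadAttention(d_key=64, d_value=64, d_model=512, n_head=8)
#     # keys=None/values=None falls back to self-attention on `queries`
#     out = attn(queries=q_input, keys=None, values=None, attn_bias=None)
#     # out: [batch, seq_len, 512]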
class DynamicDecode(Layer):
def __init__(self,
decoder,
max_step_num=None,
output_time_major=False,
impute_finished=False,
is_test=False,
return_length=False):
super(DynamicDecode, self).__init__()
self.decoder = decoder
self.max_step_num = max_step_num
self.output_time_major = output_time_major
self.impute_finished = impute_finished
self.is_test = is_test
self.return_length = return_length
def forward(self, inits=None, **kwargs):
if fluid.in_dygraph_mode():
class ArrayWrapper(object):
def __init__(self, x):
self.array = [x]
def append(self, x):
self.array.append(x)
return self
def __getitem__(self, item):
return self.array.__getitem__(item)
def _maybe_copy(state, new_state, step_mask):
# TODO: use where_op
state_dtype = state.dtype
if convert_dtype(state_dtype) in ["bool"]:
state = layers.cast(state, dtype="float32")
new_state = layers.cast(new_state, dtype="float32")
if step_mask.dtype != state.dtype:
step_mask = layers.cast(step_mask, dtype=state.dtype)
# otherwise the renamed bool gradients would be summed up, leading
# to a sum(bool) error.
step_mask.stop_gradient = True
new_state = layers.elementwise_mul(
state, step_mask, axis=0) - layers.elementwise_mul(
new_state, (step_mask - 1), axis=0)
if convert_dtype(state_dtype) in ["bool"]:
new_state = layers.cast(new_state, dtype=state_dtype)
return new_state
initial_inputs, initial_states, initial_finished = self.decoder.initialize(
inits)
inputs, states, finished = (initial_inputs, initial_states,
initial_finished)
cond = layers.logical_not((layers.reduce_all(initial_finished)))
sequence_lengths = layers.cast(
layers.zeros_like(initial_finished), "int64")
outputs = None
step_idx = 0
step_idx_tensor = layers.fill_constant(
shape=[1], dtype="int64", value=step_idx)
while cond.numpy():
(step_outputs, next_states, next_inputs,
next_finished) = self.decoder.step(step_idx_tensor, inputs,
states, **kwargs)
next_finished = layers.logical_or(next_finished, finished)
next_sequence_lengths = layers.elementwise_add(
sequence_lengths,
layers.cast(
layers.logical_not(finished), sequence_lengths.dtype))
if self.impute_finished: # rectify the states for the finished.
next_states = map_structure(
lambda x, y: _maybe_copy(x, y, finished), states,
next_states)
outputs = map_structure(
lambda x: ArrayWrapper(x),
step_outputs) if step_idx == 0 else map_structure(
lambda x, x_array: x_array.append(x), step_outputs,
outputs)
inputs, states, finished, sequence_lengths = (
next_inputs, next_states, next_finished,
next_sequence_lengths)
layers.increment(x=step_idx_tensor, value=1.0, in_place=True)
step_idx += 1
cond = layers.logical_not(layers.reduce_all(finished), cond)
if self.max_step_num is not None and step_idx > self.max_step_num:
break
final_outputs = map_structure(
lambda x: fluid.layers.stack(x.array, axis=0), outputs)
final_states = states
try:
final_outputs, final_states = self.decoder.finalize(
final_outputs, final_states, sequence_lengths)
except NotImplementedError:
pass
if not self.output_time_major:
final_outputs = map_structure(
lambda x: layers.transpose(x, [1, 0] + list(
range(2, len(x.shape)))), final_outputs)
return (final_outputs, final_states,
sequence_lengths) if self.return_length else (
final_outputs, final_states)
else:
return fluid.layers.dynamic_decode(
self.decoder,
inits,
max_step_num=self.max_step_num,
output_time_major=self.output_time_major,
impute_finished=self.impute_finished,
is_test=self.is_test,
return_length=self.return_length,
**kwargs)
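# The wrapped `decoder` is expected to follow the standard fluid decoder
# contract: `initialize(inits)` returns (initial_inputs, initial_states,
# initial_finished); `step(time, inputs, states, **kwargs)` returns
# (outputs, next_states, next_inputs, finished); and an optional
# `finalize(outputs, states, sequence_lengths)` can post-process the results
# (e.g. backtrace beams). The dygraph branch above re-implements the
# `fluid.layers.dynamic_decode` loop imperatively on top of that contract.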
class TransfomerCell(object):
"""
Let inputs=(trg_word, trg_pos) and states=cache so that the Transformer
decoder can be used as an RNNCell by the decoding loop.
"""
def __init__(self, decoder):
self.decoder = decoder
def __call__(self, inputs, states, trg_src_attn_bias, enc_output,
static_caches):
trg_word, trg_pos = inputs
for cache, static_cache in zip(states, static_caches):
cache.update(static_cache)
logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
enc_output, states)
new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states]
return logits, new_states
class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):
def __init__(self, cell, start_token, end_token, beam_size,
var_dim_in_state):
super(TransformerBeamSearchDecoder,
self).__init__(cell, start_token, end_token, beam_size)
self.cell = cell
self.var_dim_in_state = var_dim_in_state
def _merge_batch_beams_with_var_dim(self, x):
# the cache length starts at 0 and grows as decoding proceeds, so the
# reshape has to keep that dimension dynamic
var_dim_in_state = self.var_dim_in_state + 1 # count in beam dim
x = layers.transpose(x,
list(range(var_dim_in_state, len(x.shape))) +
list(range(0, var_dim_in_state)))
x = layers.reshape(
x, [0] * (len(x.shape) - var_dim_in_state
) + [self.batch_size * self.beam_size] +
[int(size) for size in x.shape[-var_dim_in_state + 2:]])
x = layers.transpose(
x,
list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) +
list(range(0, (len(x.shape) + 1 - var_dim_in_state))))
return x
def _split_batch_beams_with_var_dim(self, x):
var_dim_size = layers.shape(x)[self.var_dim_in_state]
x = layers.reshape(
x, [-1, self.beam_size] +
[int(size)
for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] +
[int(size) for size in x.shape[self.var_dim_in_state + 1:]])
return x
def step(self, time, inputs, states, **kwargs):
# compared to RNN, Transformer has 3D data at every decoding step
inputs = layers.reshape(inputs, [-1, 1]) # token
pos = layers.ones_like(inputs) * time # pos
cell_states = map_structure(self._merge_batch_beams_with_var_dim,
states.cell_states)
cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states,
**kwargs)
cell_outputs = map_structure(self._split_batch_beams, cell_outputs)
next_cell_states = map_structure(self._split_batch_beams_with_var_dim,
next_cell_states)
beam_search_output, beam_search_state = self._beam_search_step(
time=time,
logits=cell_outputs,
next_cell_states=next_cell_states,
beam_state=states)
next_inputs, finished = (beam_search_output.predicted_ids,
beam_search_state.finished)
return (beam_search_output, beam_search_state, next_inputs, finished)
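# How the pieces above fit together for inference (illustrative sketch only;
# `transformer.decoder`, `caches`, `static_caches`, `enc_output`,
# `trg_src_attn_bias`, `bos_id`, `eos_id`, `beam_size` and `max_out_len` are
# assumed to be built elsewhere, and var_dim_in_state=2 assumes caches of
# shape [batch, n_head, time, d_key]):
#
#     cell = TransfomerCell(transformer.decoder)
#     decoder = DynamicDecode(
#         TransformerBeamSearchDecoder(
#             cell, bos_id, eos_id, beam_size, var_dim_in_state=2),
#         max_step_num=max_out_len,
#         is_test=True)
#     outputs, _ = decoder(
#         inits=caches,
#         trg_src_attn_bias=trg_src_attn_bias,
#         enc_output=enc_output,
#         static_caches=static_caches)
#     # `outputs` holds the beam search results (predicted ids and parents).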
@@ -71,17 +71,24 @@ def do_train(args):
     # define inputs
     inputs = [
-        Input([None, None], "int64", name="src_word"),
-        Input([None, None], "int64", name="src_pos"),
-        Input([None, args.n_head, None, None],
-              "float32",
-              name="src_slf_attn_bias"),
-        Input([None, None], "int64", name="trg_word"),
-        Input([None, None], "int64", name="trg_pos"),
-        Input([None, args.n_head, None, None],
-              "float32",
-              name="trg_slf_attn_bias"),
-        Input([None, args.n_head, None, None],
-              "float32",
-              name="trg_src_attn_bias"),
+        Input(
+            [None, None], "int64", name="src_word"),
+        Input(
+            [None, None], "int64", name="src_pos"),
+        Input(
+            [None, args.n_head, None, None],
+            "float32",
+            name="src_slf_attn_bias"),
+        Input(
+            [None, None], "int64", name="trg_word"),
+        Input(
+            [None, None], "int64", name="trg_pos"),
+        Input(
+            [None, args.n_head, None, None],
+            "float32",
+            name="trg_slf_attn_bias"),
+        Input(
+            [None, args.n_head, None, None],
+            "float32",
+            name="trg_src_attn_bias"),
     ]
@@ -97,7 +104,8 @@ def do_train(args):
     data_files = [args.training_file, args.validation_file
                   ] if args.validation_file else [args.training_file]
     for i, data_file in enumerate(data_files):
-        dataset = Seq2SeqDataset(fpattern=data_file,
+        dataset = Seq2SeqDataset(
+            fpattern=data_file,
             src_vocab_fpath=args.src_vocab_fpath,
             trg_vocab_fpath=args.trg_vocab_fpath,
             token_delimiter=args.token_delimiter,
@@ -106,7 +114,8 @@ def do_train(args):
             unk_mark=args.special_token[2])
         args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \
             args.unk_idx = dataset.get_vocab_summary()
-        batch_sampler = Seq2SeqBatchSampler(dataset=dataset,
+        batch_sampler = Seq2SeqBatchSampler(
+            dataset=dataset,
             use_token_batch=args.use_token_batch,
             batch_size=args.batch_size,
             pool_size=args.pool_size,
@@ -114,15 +123,18 @@ def do_train(args):
             shuffle=args.shuffle,
             shuffle_batch=args.shuffle_batch,
             max_length=args.max_length)
-        data_loader = DataLoader(dataset=dataset,
+        data_loader = DataLoader(
+            dataset=dataset,
             batch_sampler=batch_sampler,
             places=device,
-            feed_list=[x.forward() for x in inputs + labels],
-            collate_fn=partial(prepare_train_input,
+            feed_list=None if fluid.in_dygraph_mode() else
+            [x.forward() for x in inputs + labels],
+            collate_fn=partial(
+                prepare_train_input,
                 src_pad_idx=args.eos_idx,
                 trg_pad_idx=args.eos_idx,
                 n_head=args.n_head),
-            num_workers=0,
+            num_workers=0,  # TODO: use multi-process
             return_list=True)
         data_loaders[i] = data_loader
     train_loader, eval_loader = data_loaders
@@ -135,8 +147,10 @@ def do_train(args):
         args.relu_dropout, args.preprocess_cmd, args.postprocess_cmd,
         args.weight_sharing, args.bos_idx, args.eos_idx)
-    transformer.prepare(fluid.optimizer.Adam(
-        learning_rate=fluid.layers.noam_decay(args.d_model, args.warmup_steps),
+    transformer.prepare(
+        fluid.optimizer.Adam(
+            learning_rate=fluid.layers.noam_decay(args.d_model,
+                                                  args.warmup_steps),
         beta1=args.beta1,
         beta2=args.beta2,
         epsilon=float(args.eps),
@@ -21,6 +21,7 @@ import paddle.fluid.layers as layers
 from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable
 from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
 from model import Model, CrossEntropy, Loss
+from text import TransformerBeamSearchDecoder, DynamicDecode


 def position_encoding_init(n_position, d_pos_vec):
@@ -604,9 +605,6 @@ class Transformer(Model):
         return predict


-from rnn_api import TransformerBeamSearchDecoder, DynamicDecode
-
-
 class TransfomerCell(object):
     """
     Let inputs=(trg_word, trg_pos), states=cache to make Transformer can be