diff --git a/PaddleNLP/PaddleMT/transformer/README.md b/PaddleNLP/PaddleMT/transformer/README.md
index 2d5e31a6ee16bda2b043d19b49371091596f3a1f..90d47f53cc4566bc5428d95e24ee43641e27c90b 100644
--- a/PaddleNLP/PaddleMT/transformer/README.md
+++ b/PaddleNLP/PaddleMT/transformer/README.md
@@ -39,7 +39,7 @@
    Clone the repository to your local machine
    ```shell
    git clone https://github.com/PaddlePaddle/models.git
-   cd models/PaddleNLP/neural_machine_translation/transformer
+   cd models/PaddleNLP/PaddleMT/transformer
    ```
 
 3. Environment dependencies
diff --git a/PaddleNLP/PaddleMT/transformer/inference_model.py b/PaddleNLP/PaddleMT/transformer/inference_model.py
index d1b88f5be965f85eac1bc703ee10f7cca57bcb78..40fc7edeb229d3eb1cfbf4f6c4911b3716291efa 100644
--- a/PaddleNLP/PaddleMT/transformer/inference_model.py
+++ b/PaddleNLP/PaddleMT/transformer/inference_model.py
@@ -22,9 +22,8 @@ import numpy as np
 import paddle
 import paddle.fluid as fluid
 
-#include palm for easier nlp coding
-from palm.toolkit.input_field import InputField
-from palm.toolkit.configure import PDConfig
+from utils.input_field import InputField
+from utils.configure import PDConfig
 
 # include task-specific libs
 import desc
diff --git a/PaddleNLP/PaddleMT/transformer/main.py b/PaddleNLP/PaddleMT/transformer/main.py
index 6ff929af0e72296bc635a56d90d2c0925b5bad68..feaf29baeb386b7843651ff9fc4197861d702c66 100644
--- a/PaddleNLP/PaddleMT/transformer/main.py
+++ b/PaddleNLP/PaddleMT/transformer/main.py
@@ -20,13 +20,12 @@ import numpy as np
 import paddle
 import paddle.fluid as fluid
 
-#include palm for easier nlp coding
-from palm.toolkit.configure import PDConfig
-
+from utils.configure import PDConfig
 from train import do_train
 from predict import do_predict
 from inference_model import do_save_inference_model
 
+
 if __name__ == "__main__":
     LOG_FORMAT = "[%(asctime)s %(levelname)s %(filename)s:%(lineno)d] %(message)s"
     logging.basicConfig(
diff --git a/PaddleNLP/PaddleMT/transformer/transformer.py b/PaddleNLP/PaddleMT/transformer/transformer.py
index 3e2367c1156b06dad7daa5d1409139c3c01c7c2e..be20001b25fdb94fcc4bc234bae220413ddfacdd 100644
--- a/PaddleNLP/PaddleMT/transformer/transformer.py
+++ b/PaddleNLP/PaddleMT/transformer/transformer.py
@@ -301,16 +301,15 @@ def prepare_encoder_decoder(src_word,
         src_word,
         size=[src_vocab_size, src_emb_dim],
         padding_idx=bos_idx,  # set embedding of bos to 0
-        param_attr=fluid.ParamAttr(
-            name=word_emb_param_name,
-            initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
+        param_attr=fluid.ParamAttr(name=word_emb_param_name,
+                                   initializer=fluid.initializer.Normal(
+                                       0., src_emb_dim**-0.5)))
     src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5)
-    src_pos_enc = fluid.embedding(
-        src_pos,
-        size=[src_max_len, src_emb_dim],
-        param_attr=fluid.ParamAttr(
-            name=pos_enc_param_name, trainable=False))
+    src_pos_enc = fluid.embedding(src_pos,
+                                  size=[src_max_len, src_emb_dim],
+                                  param_attr=fluid.ParamAttr(
+                                      name=pos_enc_param_name, trainable=False))
     src_pos_enc.stop_gradient = True
     enc_input = src_word_emb + src_pos_enc
     return layers.dropout(
@@ -537,51 +536,48 @@ def transformer(model_input,
     label = model_input.lbl_word
     weights = model_input.lbl_weight
 
-    enc_output = wrap_encoder(
-        enc_inputs,
-        src_vocab_size,
-        max_length,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-
d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - enc_output=enc_output) + enc_output = wrap_encoder(enc_inputs, + src_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + bos_idx=bos_idx) + + predict = wrap_decoder(dec_inputs, + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + enc_output=enc_output) # Padding index do not contribute to the total loss. The weights is used to # cancel padding index in calculating the loss. if label_smooth_eps: # TODO: use fluid.input.one_hot after softmax_with_cross_entropy removing # the enforcement that the last dimension of label must be 1. - label = layers.label_smooth( - label=layers.one_hot( - input=label, depth=trg_vocab_size), - epsilon=label_smooth_eps) + label = layers.label_smooth(label=layers.one_hot(input=label, + depth=trg_vocab_size), + epsilon=label_smooth_eps) cost = layers.softmax_with_cross_entropy( logits=predict, @@ -726,23 +722,22 @@ def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len, dec_inputs = (model_input.trg_word, model_input.init_score, model_input.init_idx, model_input.trg_src_attn_bias) - enc_output = wrap_encoder( - enc_inputs, - src_vocab_size, - max_in_len, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - bos_idx=bos_idx) + enc_output = wrap_encoder(enc_inputs, + src_vocab_size, + max_in_len, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + bos_idx=bos_idx) start_tokens, init_scores, parent_idx, trg_src_attn_bias = dec_inputs def beam_search(): @@ -801,26 +796,25 @@ def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len, dtype=pre_ids.dtype), y=step_idx, axis=0) - logits = wrap_decoder( - (pre_ids, pre_pos, None, pre_src_attn_bias), - trg_vocab_size, - max_in_len, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - enc_output=enc_output, - caches=caches, - gather_idx=parent_idx, - bos_idx=bos_idx) + logits = wrap_decoder((pre_ids, pre_pos, None, pre_src_attn_bias), + trg_vocab_size, + max_in_len, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + preprocess_cmd, + postprocess_cmd, + weight_sharing, + enc_output=enc_output, + caches=caches, + gather_idx=parent_idx, + bos_idx=bos_idx) # intra-beam topK topk_scores, topk_indices = layers.topk( input=layers.softmax(logits), k=beam_size) diff --git a/PaddleNLP/neural_machine_translation/transformer/.run_ce.sh b/PaddleNLP/neural_machine_translation/transformer/.run_ce.sh deleted file mode 100755 index 357cb7386114eba8266d1240676c4599aff2de01..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/.run_ce.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -DATA_PATH=./dataset/wmt16 - -train(){ - python -u 
main.py \
-        --do_train True \
-        --src_vocab_fpath $DATA_PATH/en_10000.dict \
-        --trg_vocab_fpath $DATA_PATH/de_10000.dict \
-        --special_token '<s>' '<e>' '<unk>' \
-        --training_file $DATA_PATH/wmt16/train \
-        --use_token_batch True \
-        --batch_size 2048 \
-        --sort_type pool \
-        --pool_size 10000 \
-        --print_step 1 \
-        --weight_sharing False \
-        --epoch 20 \
-        --enable_ce True \
-        --random_seed 1000 \
-        --save_checkpoint "" \
-        --save_param ""
-}
-
-cudaid=${transformer:=0} # use 0-th card as default
-export CUDA_VISIBLE_DEVICES=$cudaid
-
-train | python _ce.py
-
-cudaid=${transformer_m:=0,1,2,3} # use 0,1,2,3 card as default
-export CUDA_VISIBLE_DEVICES=$cudaid
-
-train | python _ce.py
\ No newline at end of file
diff --git a/PaddleNLP/neural_machine_translation/transformer/README.md b/PaddleNLP/neural_machine_translation/transformer/README.md
deleted file mode 100644
index 2d5e31a6ee16bda2b043d19b49371091596f3a1f..0000000000000000000000000000000000000000
--- a/PaddleNLP/neural_machine_translation/transformer/README.md
+++ /dev/null
@@ -1,270 +0,0 @@
-## Transformer
-
-Below is a brief directory structure and description for this example:
-
-```text
-.
-├── images               # images used in this README
-├── utils                # utilities
-├── desc.py              # input descriptions
-├── gen_data.sh          # data generation script
-├── inference_model.py   # script for saving an inference model
-├── main.py              # main entry point
-├── predict.py           # prediction script
-├── reader.py            # data reading interface
-├── README.md            # documentation
-├── train.py             # training script
-├── transformer.py       # model definition
-└── transformer.yaml     # configuration file
-```
-
-## Model Introduction
-
-Machine translation (MT) uses computers to transform one natural language (the source language) into another (the target language): the input is a source-language sentence and the output is the corresponding target-language sentence.
-
-This project is a PaddlePaddle implementation of Transformer, the mainstream model in machine translation. It covers model training, prediction, and the use of custom data, so users can build their own translation models on top of what is released here.
-
-We also recommend the [IPython Notebook demo](https://aistudio.baidu.com/aistudio/projectDetail/122281).
-
-## Quick Start
-
-### Installation
-
-1. PaddlePaddle installation
-
-   This project depends on PaddlePaddle 1.6 or above (or an appropriate develop version); see the [installation guide](http://www.paddlepaddle.org/#quick-start).
-
-2. Download the code
-
-   Clone the repository to your local machine
-   ```shell
-   git clone https://github.com/PaddlePaddle/models.git
-   cd models/PaddleNLP/neural_machine_translation/transformer
-   ```
-
-3. Environment dependencies
-
-   See the [installation notes](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html) for PaddlePaddle.
-
-
-### Data Preparation
-
-Public dataset: the WMT shared tasks are the most authoritative international evaluations in machine translation. The English-German task provides a medium-sized dataset that is used in many papers, including the Transformer paper itself; we provide the [WMT'16 EN-DE dataset](http://www.statmt.org/wmt16/translation-task.html) as the example. Run the `gen_data.sh` script to download and preprocess it (this takes quite a while; consider running it in the background). Preprocessing mainly consists of tokenization and [byte-pair encoding (BPE)](https://arxiv.org/pdf/1508.07909). On success a `gen_data` folder is generated with the following structure:
-
-```text
-.
-├── wmt16_ende_data       # WMT16 English-German translation data
-├── wmt16_ende_data_bpe   # BPE-encoded WMT16 English-German translation data
-├── mosesdecoder          # Moses machine translation toolkit, including tokenization, BLEU evaluation and other scripts
-└── subword-nmt           # BPE encoding code
-```
-
-We also provide a preprocessed WMT'16 EN-DE dataset for [download](https://transformer-res.bj.bcebos.com/wmt16_ende_data_bpe_clean.tar.gz). It contains the vocabulary (the `vocab_all.bpe.32000` file), the BPE data needed for training (the `train.tok.clean.bpe.32000.en-de` file), the BPE data needed for prediction (`newstest2016.tok.bpe.32000.en-de` and similar files) and the corresponding tokenized data needed for evaluating predictions (`newstest2016.tok.de` and similar files).
-
-
-Custom data: to use your own data, the format directly supported by this project is tab (\t) separated source- and target-language sentence pairs, with tokens inside a sentence separated by spaces. Provide data files in this format (they may be split into multiple parts; the reader supports file globbing) together with the corresponding vocabulary files, and the project runs directly.
-
-### Single-Machine Training
-
-Taking the provided English-German data as an example, the model can be trained with:
-
-```sh
-# open garbage collection to save memory
-export FLAGS_eager_delete_tensor_gb=0.0
-# setting visible devices for training
-export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-
-python -u main.py \
-    --do_train True \
-    --epoch 30 \
-    --src_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
-    --trg_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
-    --special_token '<s>' '<e>' '<unk>' \
-    --training_file gen_data/wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de \
-    --batch_size 4096
-```
-
-The command above passes in flags for running training (`do_train`), the number of training epochs (`epoch`) and the training-data path (set it correctly; globbing is supported), among others. More flags and the supported model hyperparameters can be found in the `transformer.yaml` configuration file, which provides the Transformer base model configuration by default; values can be changed in the file or overridden on the command line (command-line values take precedence over the configuration file). The big model from the Transformer paper can be trained with:
-
-```sh
-# open garbage collection to save memory
-export FLAGS_eager_delete_tensor_gb=0.0
-# setting visible devices for training
-export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-
-python -u main.py \
-    --do_train True \
-    --epoch 30 \
-    --src_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
-    --trg_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
-    --special_token '<s>' '<e>' '<unk>' \
-    --training_file gen_data/wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de \
-    --batch_size 4096 \
-    --n_head 16 \
-    --d_model 1024 \
-    --d_inner_hid 4096 \
-    --prepostprocess_dropout 0.3
-```
-
-Training uses all GPUs by default; the number of GPUs used can be set through the `CUDA_VISIBLE_DEVICES` environment variable. CPU-only training is also possible (set `--use_cuda False`), at a relatively low speed. If `save_param` and `save_checkpoint` are provided (the defaults are trained_params and trained_ckpts), the current parameter values and a checkpoint are saved to the corresponding directories every given number of iterations (set by `save_step`, default 10000), and every given number of iterations (set by `print_step`, default 100) a log line like the following is printed to standard output:
-
-```txt
-[2019-08-02 15:30:51,656 INFO train.py:262] step_idx: 150100, epoch: 32, batch: 1364, avg loss: 2.880427, normalized loss: 1.504687, ppl: 17.821888, speed: 3.34 step/s
-[2019-08-02 15:31:19,824 INFO train.py:262] step_idx: 150200, epoch: 32, batch: 1464, avg loss: 2.955965, normalized loss: 1.580225, ppl: 19.220257, speed: 3.55 step/s
-[2019-08-02 15:31:48,151 INFO train.py:262] step_idx: 150300, epoch: 32, batch: 1564, avg loss: 2.951180, normalized loss: 1.575439, ppl: 19.128502, speed: 3.53 step/s
-[2019-08-02 15:32:16,401 INFO train.py:262] step_idx: 150400, epoch: 32, batch: 1664, avg loss: 3.027281, normalized loss: 1.651540, ppl: 20.641024, speed: 3.54 step/s
-[2019-08-02 15:32:44,764 INFO train.py:262] step_idx: 150500, epoch: 32, batch: 1764, avg loss: 3.069125, normalized loss: 1.693385, ppl: 21.523066, speed: 3.53 step/s
-[2019-08-02 15:33:13,199 INFO train.py:262] step_idx: 150600, epoch: 32, batch: 1864, avg loss: 2.869379, normalized loss: 1.493639, ppl: 17.626074, speed: 3.52 step/s
-[2019-08-02 15:33:41,601 INFO train.py:262] step_idx: 150700, epoch: 32, batch: 1964, avg loss: 2.980905, normalized loss: 1.605164, ppl: 19.705633, speed: 3.52 step/s
-[2019-08-02 15:34:10,079 INFO train.py:262] step_idx: 150800, epoch: 32, batch: 2064, avg loss: 3.047716, normalized loss: 1.671976, ppl: 21.067181, speed: 3.51 step/s
-[2019-08-02 15:34:38,598 INFO train.py:262] step_idx: 150900, epoch: 32, batch: 2164, avg loss: 2.956475, normalized loss: 1.580735, ppl: 19.230072, speed: 3.51 step/s
-```
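The quantities in these log lines can be cross-checked by hand: `ppl` is simply the exponential of `avg loss`, and `normalized loss` subtracts the constant entropy that label smoothing adds to the cross-entropy. A minimal sketch, assuming a label smoothing epsilon of 0.1 and a vocabulary size of roughly 37K (both are assumptions here; neither is printed in the log):

```python
import numpy as np

eps, vocab_size = 0.1, 37007  # assumed label_smooth_eps and BPE vocab size

# Constant entropy of the smoothed label distribution, subtracted from the
# raw cross-entropy to obtain the "normalized loss" in the log.
loss_normalizer = -((1. - eps) * np.log(1. - eps) +
                    eps * np.log(eps / (vocab_size - 1) + 1e-20))

avg_loss = 2.880427                # from the first log line above
print(np.exp(avg_loss))            # ~17.82, matches "ppl: 17.821888"
print(avg_loss - loss_normalizer)  # ~1.50, close to "normalized loss: 1.504687"
```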
-### Model Inference
-
-Taking the English-German data as an example, once training has finished the text in a given file can be translated with:
-
-```sh
-# open garbage collection to save memory
-export FLAGS_eager_delete_tensor_gb=0.0
-# setting visible devices for prediction
-export CUDA_VISIBLE_DEVICES=0
-
-python -u main.py \
-    --do_predict True \
-    --src_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
-    --trg_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
-    --special_token '<s>' '<e>' '<unk>' \
-    --predict_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \
-    --batch_size 32 \
-    --init_from_params trained_params/step_100000 \
-    --beam_size 5 \
-    --max_out_len 255 \
-    --output_file predict.txt
-```
-
- The translations of the text in the file given by `predict_file` are written to the file given by `output_file`. `init_from_params` must be set to the directory holding the model. More flags are documented in the comments in `transformer.yaml` and can be changed there. Note that any model hyperparameters set at prediction time must match those used in training; if the big model settings were used for training, prediction needs a command like:
-
-```sh
-# open garbage collection to save memory
-export FLAGS_eager_delete_tensor_gb=0.0
-# setting visible devices for prediction
-export CUDA_VISIBLE_DEVICES=0
-
-python -u main.py \
-    --do_predict True \
-    --src_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
-    --trg_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
-    --special_token '<s>' '<e>' '<unk>' \
-    --predict_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \
-    --batch_size 32 \
-    --init_from_params trained_params/step_100000 \
-    --beam_size 5 \
-    --max_out_len 255 \
-    --output_file predict.txt \
-    --n_head 16 \
-    --d_model 1024 \
-    --d_inner_hid 4096 \
-    --prepostprocess_dropout 0.3
-```
-
-
-### Model Evaluation
-
-Each line of the prediction output is the highest-scoring translation of the corresponding input line. For BPE data the predicted translations are also BPE-encoded, so they must be restored to the original (i.e. tokenized) form before evaluation is meaningful. Evaluation works as follows (BLEU is the standard automatic metric for translation tasks):
-
-```sh
-# restore the predictions in predict.txt to tokenized data
-sed -r 's/(@@ )|(@@ ?$)//g' predict.txt > predict.tok.txt
-# if the BLEU evaluation tool is missing, download it first
-# git clone https://github.com/moses-smt/mosesdecoder.git
-# taking the English-German newstest2014 test data as an example
-perl gen_data/mosesdecoder/scripts/generic/multi-bleu.perl gen_data/wmt16_ende_data/newstest2014.tok.de < predict.tok.txt
-```
-Output like the following can be expected:
-```
-BLEU = 26.35, 57.7/32.1/20.0/13.0 (BP=1.000, ratio=1.013, hyp_len=63903, ref_len=63078)
-```
-
-Using the contents of this project, the English-German base and big models reach roughly the following BLEU values after 100K iterations of training on eight GPUs:
-
-| Test set | newstest2014 | newstest2015 | newstest2016 |
-|-|-|-|-|
-| Base | 26.35 | 29.07 | 33.30 |
-| Big | 27.07 | 30.09 | 34.38 |
-
-### Pretrained Models
-
-We provide the [base model](https://transformer-res.bj.bcebos.com/base_model_params.tar.gz) and [big model](https://transformer-res.bj.bcebos.com/big_model_params.tar.gz) parameters corresponding to the BLEU values above for download (note that the models were trained and tested with the downloadable data provided above).
-
-## Advanced Usage
-
-### Background
-
-Transformer, proposed in the paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762), is a novel network architecture for sequence-to-sequence (Seq2Seq) learning tasks such as machine translation (MT); it performs sequence-to-sequence modeling entirely with attention mechanisms [1].
-
-Compared with the recurrent neural networks (RNN) widely used in earlier Seq2Seq models, using (self-)attention to transform an input sequence into an output sequence has the following main advantages (a small numeric illustration of the first point follows this list):
-
-- Lower computational complexity
-  - For sequences with feature dimension d and length n, computation in an RNN costs O(n * d * d) (n time steps, each computing a d-dimensional matrix-vector product), while self-attention costs O(n * n * d) (all pairs of the n time steps compute a d-dimensional dot product or another relevance function); n is usually smaller than d.
-- Higher parallelism
-  - In an RNN the computation at the current time step depends on the result of the previous time step; in self-attention the computation at each time step depends only on the input, not on the outputs of earlier time steps, so all time steps can run fully in parallel.
-- Easier learning of long-range dependencies
-  - In an RNN, relating two positions that are n apart takes n steps; in self-attention any two positions are directly connected, and the shorter the path, the easier signals propagate.
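As a rough, purely illustrative check of the first bullet (d = 512 follows the base model's `d_model`; the sequence lengths n are arbitrary assumptions):

```python
# Per-layer cost in multiply-accumulates, constants ignored:
#   RNN:            n steps, each a d x d matrix-vector product -> n * d * d
#   self-attention: n * n pairwise d-dimensional dot products   -> n * n * d
d = 512
for n in (10, 50, 512):
    rnn, attn = n * d * d, n * n * d
    print(n, rnn, attn)  # attention is cheaper whenever n < d
```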
-
-The self-attention-based sequence modeling module introduced in Transformer has since been widely adopted in semantic representation models such as BERT [2], with remarkable results.
-
-
-### Model Overview
-
-Transformer uses the classic encoder-decoder framework of Seq2Seq models; the overall network architecture is shown in Figure 1.
-
-<p align="center">
-<img src="./images/transformer_network.png"/> <br />
-Figure 1. Transformer network architecture
-</p>
-
-Note that, unlike earlier Seq2Seq models, Transformer no longer uses RNN structures in its Encoder and Decoder.
-
-### Model Features
-
-The Encoder in Transformer is a stack of identical layers, each consisting of two sub-layers: multi-head attention and a fully connected feed-forward network.
-- Multi-Head Attention implements self-attention here. Compared with a plain attention mechanism, it applies several linear projections to the input, computes attention for each projection separately, concatenates all the results, and applies one more linear transform to produce the output. See Figure 2: the attention used is dot-product attention, scaled after the dot product to avoid pushing large values into the saturated region of the softmax (a sketch of this computation follows Figure 2 below).
-- The Feed-Forward network performs the same computation at every position of the sequence (position-wise), namely two linear transforms with a ReLU activation in between (a sketch follows this list).
-
-In addition, each sub-layer is followed by a residual connection [3] and layer normalization [4] to help gradient propagation and model convergence.
-
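A minimal numpy sketch of the position-wise feed-forward sub-layer together with the residual connection and layer normalization described above (shapes follow the base-model defaults d_model = 512 and d_inner_hid = 2048; this is a simplification under those assumptions, not a copy of the Paddle implementation):

```python
import numpy as np

def position_wise_ffn(x, w1, b1, w2, b2):
    # Two linear transforms with ReLU in between, applied at every position.
    return np.maximum(x @ w1 + b1, 0.) @ w2 + b2

def layer_norm(x, eps=1e-6):
    mu, var = x.mean(-1, keepdims=True), x.var(-1, keepdims=True)
    return (x - mu) / np.sqrt(var + eps)

n, d_model, d_inner = 6, 512, 2048  # sequence length 6 is arbitrary
x = np.random.randn(n, d_model)
w1, b1 = 0.02 * np.random.randn(d_model, d_inner), np.zeros(d_inner)
w2, b2 = 0.02 * np.random.randn(d_inner, d_model), np.zeros(d_model)
out = layer_norm(x + position_wise_ffn(x, w1, b1, w2, b2))  # residual + LN
print(out.shape)  # (6, 512)
```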
-<p align="center">
-<img src="./images/multi_head_attention.png"/> <br />
-Figure 2. Multi-Head Attention
-</p>
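A minimal numpy sketch of the scaled dot-product attention at the core of Figure 2, for a single head (d_k = 64 follows the base model's d_key; the shapes are illustrative assumptions):

```python
import numpy as np

def scaled_dot_product_attention(q, k, v):
    d_k = q.shape[-1]
    # Scale by 1/sqrt(d_k) so large dot products do not push the softmax
    # into its saturated region.
    scores = q @ k.T / np.sqrt(d_k)
    weights = np.exp(scores - scores.max(-1, keepdims=True))
    weights /= weights.sum(-1, keepdims=True)  # row-wise softmax
    return weights @ v

n, d_k = 5, 64  # 5 positions, head width 64
q, k, v = (np.random.randn(n, d_k) for _ in range(3))
print(scaled_dot_product_attention(q, k, v).shape)  # (5, 64)
```

Multi-head attention runs several such heads on differently projected inputs, concatenates their outputs, and applies one final linear projection.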
-
-The Decoder has a structure similar to the Encoder, except that each of its layers contains one extra multi-head attention sub-layer attending over the Encoder output. This encoder-decoder attention also exists in other Seq2Seq models.
-
-## FAQ
-
-**Q:** Why does the prediction output contain fewer samples than the input?
-**A:** If the longest sample exceeds the default `max_length` in `transformer.yaml`, increase `--max_length` at run time; otherwise over-long samples are filtered out.
-
-**Q:** What if the maximum length at prediction time exceeds the maximum length used in training?
-**A:** The `max_length` setting at training time determines the size of the saved model's position encoding. If lengths at prediction time exceed `max_length`, increase this value and a larger position encoding table will be regenerated (a sketch of this encoding follows this section).
-
-
-## References
-1. Vaswani A, Shazeer N, Parmar N, et al. [Attention is all you need](http://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf)[C]//Advances in Neural Information Processing Systems. 2017: 6000-6010.
-2. Devlin J, Chang M W, Lee K, et al. [Bert: Pre-training of deep bidirectional transformers for language understanding](https://arxiv.org/abs/1810.04805)[J]. arXiv preprint arXiv:1810.04805, 2018.
-3. He K, Zhang X, Ren S, et al. [Deep residual learning for image recognition](http://openaccess.thecvf.com/content_cvpr_2016/papers/He_Deep_Residual_Learning_CVPR_2016_paper.pdf)[C]//Proceedings of the IEEE conference on computer vision and pattern recognition. 2016: 770-778.
-4. Ba J L, Kiros J R, Hinton G E. [Layer normalization](https://arxiv.org/pdf/1607.06450.pdf)[J]. arXiv preprint arXiv:1607.06450, 2016.
-5. Sennrich R, Haddow B, Birch A. [Neural machine translation of rare words with subword units](https://arxiv.org/pdf/1508.07909)[J]. arXiv preprint arXiv:1508.07909, 2015.
-
-
-## Changelog
-
-2019/08/16: standardized the code and updated the usage of Paddle APIs.
-
-## Author
-- [guochengCS](https://github.com/guoshengCS)
-
-## How to Contribute
-
-If you can fix an issue or add a new feature, feel free to submit a PR. If the PR is accepted, we score the contribution by quality and difficulty (0-5 points, higher is better). Once you accumulate 10 points, you may contact us for an interview opportunity or a recommendation letter.
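The second FAQ answer refers to regenerating the position encoding table. A minimal sketch of the sinusoidal position encoding; the project's real helper is `position_encoding_init` in transformer.py (its `(length, d_model)` call signature is visible in predict.py below, but the body here is a hedged re-implementation under that assumption, not the project's code):

```python
import numpy as np

def position_encoding_init(n_position, d_model):
    # pe[pos, 2i]   = sin(pos / 10000^(2i / d_model))
    # pe[pos, 2i+1] = cos(pos / 10000^(2i / d_model))
    pos = np.arange(n_position, dtype="float64")[:, None]
    i = np.arange(0, d_model, 2, dtype="float64")
    angle = pos / np.power(10000.0, i / d_model)
    pe = np.zeros((n_position, d_model))
    pe[:, 0::2] = np.sin(angle)
    pe[:, 1::2] = np.cos(angle)
    return pe.astype("float32")

# Allowing longer predictions only means building a bigger table:
print(position_encoding_init(256 + 1, 512).shape)  # (257, 512)
```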
diff --git a/PaddleNLP/neural_machine_translation/transformer/__init__.py b/PaddleNLP/neural_machine_translation/transformer/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PaddleNLP/neural_machine_translation/transformer/_ce.py b/PaddleNLP/neural_machine_translation/transformer/_ce.py
deleted file mode 100644
index c619b51377a446bed2872480d6e39dd6a61507ab..0000000000000000000000000000000000000000
--- a/PaddleNLP/neural_machine_translation/transformer/_ce.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#### This file is only used for continuous evaluation tests!
-
-import os
-import sys
-sys.path.insert(0, os.environ['ceroot'])
-from kpi import CostKpi, DurationKpi, AccKpi
-
-#### NOTE kpi.py should be shared in models in some way!!!!
-
-train_cost_card1_kpi = CostKpi('train_cost_card1', 0.002, 0, actived=True)
-# test_cost_card1_kpi = CostKpi('test_cost_card1', 0.008, 0, actived=True)
-train_duration_card1_kpi = DurationKpi(
-    'train_duration_card1', 0.006, 0, actived=True)
-train_cost_card4_kpi = CostKpi('train_cost_card4', 0.001, 0, actived=True)
-# test_cost_card4_kpi = CostKpi('test_cost_card4', 0.001, 0, actived=True)
-train_duration_card4_kpi = DurationKpi(
-    'train_duration_card4', 0.02, 0, actived=True)
-
-tracking_kpis = [
-    train_cost_card1_kpi,
-    # test_cost_card1_kpi,
-    train_duration_card1_kpi,
-    train_cost_card4_kpi,
-    # test_cost_card4_kpi,
-    train_duration_card4_kpi,
-]
-
-
-def parse_log(log):
-    '''
-    This method should be implemented by model developers.
-    The suggestion:
-    each line in the log should be key, value, for example:
-    "
-    train_cost\t1.0
-    test_cost\t1.0
-    train_cost\t1.0
-    train_cost\t1.0
-    train_acc\t1.2
-    "
-    '''
-    for line in log.split('\n'):
-        fs = line.strip().split('\t')
-        print(fs)
-        if len(fs) == 3 and fs[0] == 'kpis':
-            print("-----%s" % fs)
-            kpi_name = fs[1]
-            kpi_value = float(fs[2])
-            yield kpi_name, kpi_value
-
-
-def log_to_ce(log):
-    kpi_tracker = {}
-    for kpi in tracking_kpis:
-        kpi_tracker[kpi.name] = kpi
-
-    for (kpi_name, kpi_value) in parse_log(log):
-        print(kpi_name, kpi_value)
-        kpi_tracker[kpi_name].add_record(kpi_value)
-        kpi_tracker[kpi_name].persist()
-
-
-if __name__ == '__main__':
-    log = sys.stdin.read()
-    print("*****")
-    print(log)
-    print("****")
-    log_to_ce(log)
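Note that, despite the `key\tvalue` suggestion in its docstring, `parse_log` above only keeps three-field lines whose first field is `kpis`. A short standalone demonstration of the accepted format (the numeric values are made up):

```python
def parse_log(log):
    # Same filter as in _ce.py above: keep "kpis\t<name>\t<value>" lines only.
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        if len(fs) == 3 and fs[0] == 'kpis':
            yield fs[1], float(fs[2])

sample_log = "\n".join([
    "step_idx: 100, avg loss: 2.88",      # ignored: not a kpis line
    "kpis\ttrain_cost_card1\t2.880427",   # kept
    "kpis\ttrain_duration_card1\t29.93",  # kept
])
print(list(parse_log(sample_log)))
# [('train_cost_card1', 2.880427), ('train_duration_card1', 29.93)]
```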
diff --git a/PaddleNLP/neural_machine_translation/transformer/desc.py b/PaddleNLP/neural_machine_translation/transformer/desc.py
deleted file mode 100644
index d6c34191cd5f182b17eaabbce29c811985e97703..0000000000000000000000000000000000000000
--- a/PaddleNLP/neural_machine_translation/transformer/desc.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# The placeholder for batch_size in compile time. Must be -1 currently to be
-# consistent with some ops' infer-shape output in compile time, such as the
-# sequence_expand op used in beamsearch decoder.
-batch_size = None
-# The placeholder for sequence length in compile time.
-seq_len = None
-# The placeholder for head number in compile time.
-n_head = 8
-# The placeholder for model dim in compile time.
-d_model = 512
-# Here list the data shapes and data types of all inputs.
-# The shapes here act as placeholder and are set to pass the infer-shape in
-# compile time.
-input_descs = {
-    # The actual data shape of src_word is:
-    # [batch_size, max_src_len_in_batch]
-    "src_word": [(batch_size, seq_len), "int64", 2],
-    # The actual data shape of src_pos is:
-    # [batch_size, max_src_len_in_batch, 1]
-    "src_pos": [(batch_size, seq_len), "int64"],
-    # This input is used to remove attention weights on paddings in the
-    # encoder.
-    # The actual data shape of src_slf_attn_bias is:
-    # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
-    "src_slf_attn_bias": [(batch_size, n_head, seq_len, seq_len), "float32"],
-    # The actual data shape of trg_word is:
-    # [batch_size, max_trg_len_in_batch, 1]
-    "trg_word": [(batch_size, seq_len), "int64",
-                 2],  # lod_level is only used in fast decoder.
-    # The actual data shape of trg_pos is:
-    # [batch_size, max_trg_len_in_batch, 1]
-    "trg_pos": [(batch_size, seq_len), "int64"],
-    # This input is used to remove attention weights on paddings and
-    # subsequent words in the decoder.
-    # The actual data shape of trg_slf_attn_bias is:
-    # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
-    "trg_slf_attn_bias": [(batch_size, n_head, seq_len, seq_len), "float32"],
-    # This input is used to remove attention weights on paddings of the source
-    # input in the encoder-decoder attention.
-    # The actual data shape of trg_src_attn_bias is:
-    # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
-    "trg_src_attn_bias": [(batch_size, n_head, seq_len, seq_len), "float32"],
-    # This input is used in independent decoder program for inference.
-    # The actual data shape of enc_output is:
-    # [batch_size, max_src_len_in_batch, d_model]
-    "enc_output": [(batch_size, seq_len, d_model), "float32"],
-    # The actual data shape of label_word is:
-    # [batch_size * max_trg_len_in_batch, 1]
-    "lbl_word": [(None, 1), "int64"],
-    # This input is used to mask out the loss of padding tokens.
-    # The actual data shape of label_weight is:
-    # [batch_size * max_trg_len_in_batch, 1]
-    "lbl_weight": [(None, 1), "float32"],
-    # This input is used in beam-search decoder.
-    "init_score": [(batch_size, 1), "float32", 2],
-    # This input is used in beam-search decoder for the first gather
-    # (cell states update)
-    "init_idx": [(batch_size, ), "int32"],
-}
-
-# Names of word embedding table which might be reused for weight sharing.
-word_emb_param_names = (
-    "src_word_emb_table",
-    "trg_word_emb_table", )
-# Names of position encoding table which will be initialized externally.
-pos_enc_param_names = (
-    "src_pos_enc_table",
-    "trg_pos_enc_table", )
-# separated inputs for different usages.
-encoder_data_input_fields = (
-    "src_word",
-    "src_pos",
-    "src_slf_attn_bias", )
-decoder_data_input_fields = (
-    "trg_word",
-    "trg_pos",
-    "trg_slf_attn_bias",
-    "trg_src_attn_bias",
-    "enc_output", )
-label_data_input_fields = (
-    "lbl_word",
-    "lbl_weight", )
-# In fast decoder, trg_pos (only containing the current time step) is generated
-# by ops and trg_slf_attn_bias is not needed.
-fast_decoder_data_input_fields = (
-    "trg_word",
-    "init_score",
-    "init_idx",
-    "trg_src_attn_bias", )
diff --git a/PaddleNLP/neural_machine_translation/transformer/gen_data.sh b/PaddleNLP/neural_machine_translation/transformer/gen_data.sh
deleted file mode 100644
index e00ae05d9c5cc59b7b401428f6e1252397debfe9..0000000000000000000000000000000000000000
--- a/PaddleNLP/neural_machine_translation/transformer/gen_data.sh
+++ /dev/null
@@ -1,220 +0,0 @@
-#!
/usr/bin/env bash - -set -e - -OUTPUT_DIR=$PWD/gen_data - -############################################################################### -# change these variables for other WMT data -############################################################################### -OUTPUT_DIR_DATA="${OUTPUT_DIR}/wmt16_ende_data" -OUTPUT_DIR_BPE_DATA="${OUTPUT_DIR}/wmt16_ende_data_bpe" -LANG1="en" -LANG2="de" -# each of TRAIN_DATA: data_url data_file_lang1 data_file_lang2 -TRAIN_DATA=( -'http://www.statmt.org/europarl/v7/de-en.tgz' -'europarl-v7.de-en.en' 'europarl-v7.de-en.de' -'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz' -'commoncrawl.de-en.en' 'commoncrawl.de-en.de' -'http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz' -'news-commentary-v11.de-en.en' 'news-commentary-v11.de-en.de' -) -# each of DEV_TEST_DATA: data_url data_file_lang1 data_file_lang2 -DEV_TEST_DATA=( -'http://data.statmt.org/wmt16/translation-task/dev.tgz' -'newstest201[45]-deen-ref.en.sgm' 'newstest201[45]-deen-src.de.sgm' -'http://data.statmt.org/wmt16/translation-task/test.tgz' -'newstest2016-deen-ref.en.sgm' 'newstest2016-deen-src.de.sgm' -) -############################################################################### - -############################################################################### -# change these variables for other WMT data -############################################################################### -# OUTPUT_DIR_DATA="${OUTPUT_DIR}/wmt14_enfr_data" -# OUTPUT_DIR_BPE_DATA="${OUTPUT_DIR}/wmt14_enfr_data_bpe" -# LANG1="en" -# LANG2="fr" -# # each of TRAIN_DATA: ata_url data_tgz data_file -# TRAIN_DATA=( -# 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz' -# 'commoncrawl.fr-en.en' 'commoncrawl.fr-en.fr' -# 'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz' -# 'training/europarl-v7.fr-en.en' 'training/europarl-v7.fr-en.fr' -# 'http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz' -# 'training/news-commentary-v9.fr-en.en' 'training/news-commentary-v9.fr-en.fr' -# 'http://www.statmt.org/wmt10/training-giga-fren.tar' -# 'giga-fren.release2.fixed.en.*' 'giga-fren.release2.fixed.fr.*' -# 'http://www.statmt.org/wmt13/training-parallel-un.tgz' -# 'un/undoc.2000.fr-en.en' 'un/undoc.2000.fr-en.fr' -# ) -# # each of DEV_TEST_DATA: data_url data_tgz data_file_lang1 data_file_lang2 -# DEV_TEST_DATA=( -# 'http://data.statmt.org/wmt16/translation-task/dev.tgz' -# '.*/newstest201[45]-fren-ref.en.sgm' '.*/newstest201[45]-fren-src.fr.sgm' -# 'http://data.statmt.org/wmt16/translation-task/test.tgz' -# '.*/newstest2016-fren-ref.en.sgm' '.*/newstest2016-fren-src.fr.sgm' -# ) -############################################################################### - -mkdir -p $OUTPUT_DIR_DATA $OUTPUT_DIR_BPE_DATA - -# Extract training data -for ((i=0;i<${#TRAIN_DATA[@]};i+=3)); do - data_url=${TRAIN_DATA[i]} - data_tgz=${data_url##*/} # training-parallel-commoncrawl.tgz - data=${data_tgz%.*} # training-parallel-commoncrawl - data_lang1=${TRAIN_DATA[i+1]} - data_lang2=${TRAIN_DATA[i+2]} - if [ ! -e ${OUTPUT_DIR_DATA}/${data_tgz} ]; then - echo "Download "${data_url} - wget -O ${OUTPUT_DIR_DATA}/${data_tgz} ${data_url} - fi - - if [ ! 
-d ${OUTPUT_DIR_DATA}/${data} ]; then - echo "Extract "${data_tgz} - mkdir -p ${OUTPUT_DIR_DATA}/${data} - tar_type=${data_tgz:0-3} - if [ ${tar_type} == "tar" ]; then - tar -xvf ${OUTPUT_DIR_DATA}/${data_tgz} -C ${OUTPUT_DIR_DATA}/${data} - else - tar -xvzf ${OUTPUT_DIR_DATA}/${data_tgz} -C ${OUTPUT_DIR_DATA}/${data} - fi - fi - # concatenate all training data - for data_lang in $data_lang1 $data_lang2; do - for f in `find ${OUTPUT_DIR_DATA}/${data} -regex ".*/${data_lang}"`; do - data_dir=`dirname $f` - data_file=`basename $f` - f_base=${f%.*} - f_ext=${f##*.} - if [ $f_ext == "gz" ]; then - gunzip $f - l=${f_base##*.} - f_base=${f_base%.*} - else - l=${f_ext} - fi - - if [ $i -eq 0 ]; then - cat ${f_base}.$l > ${OUTPUT_DIR_DATA}/train.$l - else - cat ${f_base}.$l >> ${OUTPUT_DIR_DATA}/train.$l - fi - done - done -done - -# Clone mosesdecoder -if [ ! -d ${OUTPUT_DIR}/mosesdecoder ]; then - echo "Cloning moses for data processing" - git clone https://github.com/moses-smt/mosesdecoder.git ${OUTPUT_DIR}/mosesdecoder -fi - -# Extract develop and test data -dev_test_data="" -for ((i=0;i<${#DEV_TEST_DATA[@]};i+=3)); do - data_url=${DEV_TEST_DATA[i]} - data_tgz=${data_url##*/} # training-parallel-commoncrawl.tgz - data=${data_tgz%.*} # training-parallel-commoncrawl - data_lang1=${DEV_TEST_DATA[i+1]} - data_lang2=${DEV_TEST_DATA[i+2]} - if [ ! -e ${OUTPUT_DIR_DATA}/${data_tgz} ]; then - echo "Download "${data_url} - wget -O ${OUTPUT_DIR_DATA}/${data_tgz} ${data_url} - fi - - if [ ! -d ${OUTPUT_DIR_DATA}/${data} ]; then - echo "Extract "${data_tgz} - mkdir -p ${OUTPUT_DIR_DATA}/${data} - tar_type=${data_tgz:0-3} - if [ ${tar_type} == "tar" ]; then - tar -xvf ${OUTPUT_DIR_DATA}/${data_tgz} -C ${OUTPUT_DIR_DATA}/${data} - else - tar -xvzf ${OUTPUT_DIR_DATA}/${data_tgz} -C ${OUTPUT_DIR_DATA}/${data} - fi - fi - - for data_lang in $data_lang1 $data_lang2; do - for f in `find ${OUTPUT_DIR_DATA}/${data} -regex ".*/${data_lang}"`; do - data_dir=`dirname $f` - data_file=`basename $f` - data_out=`echo ${data_file} | cut -d '-' -f 1` # newstest2016 - l=`echo ${data_file} | cut -d '.' -f 2` # en - dev_test_data="${dev_test_data}\|${data_out}" # to make regexp - if [ ! -e ${OUTPUT_DIR_DATA}/${data_out}.$l ]; then - ${OUTPUT_DIR}/mosesdecoder/scripts/ems/support/input-from-sgm.perl \ - < $f > ${OUTPUT_DIR_DATA}/${data_out}.$l - fi - done - done -done - -# Tokenize data -for l in ${LANG1} ${LANG2}; do - for f in `ls ${OUTPUT_DIR_DATA}/*.$l | grep "\(train${dev_test_data}\)\.$l$"`; do - f_base=${f%.*} # dir/train dir/newstest2016 - f_out=$f_base.tok.$l - if [ ! -e $f_out ]; then - echo "Tokenize "$f - ${OUTPUT_DIR}/mosesdecoder/scripts/tokenizer/tokenizer.perl -q -l $l -threads 8 < $f > $f_out - fi - done -done - -# Clean data -for f in ${OUTPUT_DIR_DATA}/train.${LANG1} ${OUTPUT_DIR_DATA}/train.tok.${LANG1}; do - f_base=${f%.*} # dir/train dir/train.tok - f_out=${f_base}.clean - if [ ! -e $f_out.${LANG1} ] && [ ! -e $f_out.${LANG2} ]; then - echo "Clean "${f_base} - ${OUTPUT_DIR}/mosesdecoder/scripts/training/clean-corpus-n.perl $f_base ${LANG1} ${LANG2} ${f_out} 1 80 - fi -done - -# Clone subword-nmt and generate BPE data -if [ ! -d ${OUTPUT_DIR}/subword-nmt ]; then - git clone https://github.com/rsennrich/subword-nmt.git ${OUTPUT_DIR}/subword-nmt -fi - -# Generate BPE data and vocabulary -for num_operations in 32000; do - if [ ! 
-e ${OUTPUT_DIR_BPE_DATA}/bpe.${num_operations} ]; then
-    echo "Learn BPE with ${num_operations} merge operations"
-    cat ${OUTPUT_DIR_DATA}/train.tok.clean.${LANG1} ${OUTPUT_DIR_DATA}/train.tok.clean.${LANG2} | \
-      ${OUTPUT_DIR}/subword-nmt/learn_bpe.py -s $num_operations > ${OUTPUT_DIR_BPE_DATA}/bpe.${num_operations}
-  fi
-
-  for l in ${LANG1} ${LANG2}; do
-    for f in `ls ${OUTPUT_DIR_DATA}/*.$l | grep "\(train${dev_test_data}\)\.tok\(\.clean\)\?\.$l$"`; do
-      f_base=${f%.*} # dir/train.tok dir/train.tok.clean dir/newstest2016.tok
-      f_base=${f_base##*/} # train.tok train.tok.clean newstest2016.tok
-      f_out=${OUTPUT_DIR_BPE_DATA}/${f_base}.bpe.${num_operations}.$l
-      if [ ! -e $f_out ]; then
-        echo "Apply BPE to "$f
-        ${OUTPUT_DIR}/subword-nmt/apply_bpe.py -c ${OUTPUT_DIR_BPE_DATA}/bpe.${num_operations} < $f > $f_out
-      fi
-    done
-  done
-
-  if [ ! -e ${OUTPUT_DIR_BPE_DATA}/vocab.bpe.${num_operations} ]; then
-    echo "Create vocabulary for BPE data"
-    cat ${OUTPUT_DIR_BPE_DATA}/train.tok.clean.bpe.${num_operations}.${LANG1} ${OUTPUT_DIR_BPE_DATA}/train.tok.clean.bpe.${num_operations}.${LANG2} | \
-      ${OUTPUT_DIR}/subword-nmt/get_vocab.py | cut -f1 -d ' ' > ${OUTPUT_DIR_BPE_DATA}/vocab.bpe.${num_operations}
-  fi
-done
-
-# Adapt to the reader
-for f in ${OUTPUT_DIR_BPE_DATA}/*.bpe.${num_operations}.${LANG1}; do
-  f_base=${f%.*} # dir/train.tok.clean.bpe.32000 dir/newstest2016.tok.bpe.32000
-  f_out=${f_base}.${LANG1}-${LANG2}
-  if [ ! -e $f_out ]; then
-    paste -d '\t' $f_base.${LANG1} $f_base.${LANG2} > $f_out
-  fi
-done
-if [ ! -e ${OUTPUT_DIR_BPE_DATA}/vocab_all.bpe.${num_operations} ]; then
-  sed '1i\<s>\n<e>\n<unk>' ${OUTPUT_DIR_BPE_DATA}/vocab.bpe.${num_operations} > ${OUTPUT_DIR_BPE_DATA}/vocab_all.bpe.${num_operations}
-fi
-
-echo "All done."
diff --git a/PaddleNLP/neural_machine_translation/transformer/images/multi_head_attention.png b/PaddleNLP/neural_machine_translation/transformer/images/multi_head_attention.png
deleted file mode 100644
index 427fb6b32aaeb7013066a167aab4fb97c024c2d6..0000000000000000000000000000000000000000
Binary files a/PaddleNLP/neural_machine_translation/transformer/images/multi_head_attention.png and /dev/null differ
diff --git a/PaddleNLP/neural_machine_translation/transformer/images/transformer_network.png b/PaddleNLP/neural_machine_translation/transformer/images/transformer_network.png
deleted file mode 100644
index 34be0e5c7e2b08f858683d86353db5e81049c7ca..0000000000000000000000000000000000000000
Binary files a/PaddleNLP/neural_machine_translation/transformer/images/transformer_network.png and /dev/null differ
diff --git a/PaddleNLP/neural_machine_translation/transformer/inference_model.py b/PaddleNLP/neural_machine_translation/transformer/inference_model.py
deleted file mode 100644
index d1b88f5be965f85eac1bc703ee10f7cca57bcb78..0000000000000000000000000000000000000000
--- a/PaddleNLP/neural_machine_translation/transformer/inference_model.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import six -import sys -import time - -import numpy as np -import paddle -import paddle.fluid as fluid - -#include palm for easier nlp coding -from palm.toolkit.input_field import InputField -from palm.toolkit.configure import PDConfig - -# include task-specific libs -import desc -import reader -from transformer import create_net - - -def init_from_pretrain_model(args, exe, program): - - assert isinstance(args.init_from_pretrain_model, str) - - if not os.path.exists(args.init_from_pretrain_model): - raise Warning("The pretrained params do not exist.") - return False - - def existed_params(var): - if not isinstance(var, fluid.framework.Parameter): - return False - return os.path.exists( - os.path.join(args.init_from_pretrain_model, var.name)) - - fluid.io.load_vars( - exe, - args.init_from_pretrain_model, - main_program=program, - predicate=existed_params) - - print("finish initing model from pretrained params from %s" % - (args.init_from_pretrain_model)) - - return True - - -def init_from_params(args, exe, program): - - assert isinstance(args.init_from_params, str) - - if not os.path.exists(args.init_from_params): - raise Warning("the params path does not exist.") - return False - - fluid.io.load_params( - executor=exe, - dirname=args.init_from_params, - main_program=program, - filename="params.pdparams") - - print("finish init model from params from %s" % (args.init_from_params)) - - return True - - -def do_save_inference_model(args): - if args.use_cuda: - dev_count = fluid.core.get_cuda_device_count() - place = fluid.CUDAPlace(0) - else: - dev_count = int(os.environ.get('CPU_NUM', 1)) - place = fluid.CPUPlace() - - test_prog = fluid.default_main_program() - startup_prog = fluid.default_startup_program() - - with fluid.program_guard(test_prog, startup_prog): - with fluid.unique_name.guard(): - - # define input and reader - - input_field_names = desc.encoder_data_input_fields + desc.fast_decoder_data_input_fields - input_slots = [{ - "name": name, - "shape": desc.input_descs[name][0], - "dtype": desc.input_descs[name][1] - } for name in input_field_names] - - input_field = InputField(input_slots) - input_field.build(build_pyreader=True) - - # define the network - - predictions = create_net( - is_training=False, model_input=input_field, args=args) - out_ids, out_scores = predictions - - # This is used here to set dropout to the test mode. 
- test_prog = test_prog.clone(for_test=True) - - # prepare predicting - - ## define the executor and program for training - - exe = fluid.Executor(place) - - exe.run(startup_prog) - assert (args.init_from_params) or (args.init_from_pretrain_model) - - if args.init_from_params: - init_from_params(args, exe, test_prog) - - elif args.init_from_pretrain_model: - init_from_pretrain_model(args, exe, test_prog) - - # saving inference model - - fluid.io.save_inference_model( - args.inference_model_dir, - feeded_var_names=input_field_names, - target_vars=[out_ids, out_scores], - executor=exe, - main_program=test_prog, - model_filename="model.pdmodel", - params_filename="params.pdparams") - - print("save inference model at %s" % (args.inference_model_dir)) - - -if __name__ == "__main__": - args = PDConfig(yaml_file="./transformer.yaml") - args.build() - args.Print() - - do_save_inference_model(args) diff --git a/PaddleNLP/neural_machine_translation/transformer/main.py b/PaddleNLP/neural_machine_translation/transformer/main.py deleted file mode 100644 index 6ff929af0e72296bc635a56d90d2c0925b5bad68..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/main.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import logging - -import numpy as np -import paddle -import paddle.fluid as fluid - -#include palm for easier nlp coding -from palm.toolkit.configure import PDConfig - -from train import do_train -from predict import do_predict -from inference_model import do_save_inference_model - -if __name__ == "__main__": - LOG_FORMAT = "[%(asctime)s %(levelname)s %(filename)s:%(lineno)d] %(message)s" - logging.basicConfig( - stream=sys.stdout, level=logging.DEBUG, format=LOG_FORMAT) - logging.getLogger().setLevel(logging.INFO) - - args = PDConfig(yaml_file="./transformer.yaml") - args.build() - args.Print() - - if args.do_train: - do_train(args) - - if args.do_predict: - do_predict(args) - - if args.do_save_inference_model: - do_save_inference_model(args) \ No newline at end of file diff --git a/PaddleNLP/neural_machine_translation/transformer/predict.py b/PaddleNLP/neural_machine_translation/transformer/predict.py deleted file mode 100644 index 7ad847fd313ae688e04cea4373912280e220358a..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/predict.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import six
-import sys
-import time
-
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-
-from utils.input_field import InputField
-from utils.configure import PDConfig
-from utils.check import check_gpu, check_version
-
-# include task-specific libs
-import desc
-import reader
-from transformer import create_net, position_encoding_init
-
-
-def init_from_pretrain_model(args, exe, program):
-
-    assert isinstance(args.init_from_pretrain_model, str)
-
-    if not os.path.exists(args.init_from_pretrain_model):
-        raise Warning("The pretrained params do not exist.")
-        return False
-
-    def existed_params(var):
-        if not isinstance(var, fluid.framework.Parameter):
-            return False
-        return os.path.exists(
-            os.path.join(args.init_from_pretrain_model, var.name))
-
-    fluid.io.load_vars(
-        exe,
-        args.init_from_pretrain_model,
-        main_program=program,
-        predicate=existed_params)
-
-    print("finish initing model from pretrained params from %s" %
-          (args.init_from_pretrain_model))
-
-    return True
-
-
-def init_from_params(args, exe, program):
-
-    assert isinstance(args.init_from_params, str)
-
-    if not os.path.exists(args.init_from_params):
-        raise Warning("the params path does not exist.")
-        return False
-
-    fluid.io.load_params(
-        executor=exe,
-        dirname=args.init_from_params,
-        main_program=program,
-        filename="params.pdparams")
-
-    print("finish init model from params from %s" % (args.init_from_params))
-
-    return True
-
-
-def post_process_seq(seq, bos_idx, eos_idx, output_bos=False, output_eos=False):
-    """
-    Post-process the beam-search decoded sequence. Truncate from the first
-    <eos> and remove the <bos> and <eos> tokens currently.
- """ - eos_pos = len(seq) - 1 - for i, idx in enumerate(seq): - if idx == eos_idx: - eos_pos = i - break - seq = [ - idx for idx in seq[:eos_pos + 1] - if (output_bos or idx != bos_idx) and (output_eos or idx != eos_idx) - ] - return seq - - -def do_predict(args): - if args.use_cuda: - dev_count = fluid.core.get_cuda_device_count() - place = fluid.CUDAPlace(0) - else: - dev_count = int(os.environ.get('CPU_NUM', 1)) - place = fluid.CPUPlace() - # define the data generator - processor = reader.DataProcessor( - fpattern=args.predict_file, - src_vocab_fpath=args.src_vocab_fpath, - trg_vocab_fpath=args.trg_vocab_fpath, - token_delimiter=args.token_delimiter, - use_token_batch=False, - batch_size=args.batch_size, - device_count=dev_count, - pool_size=args.pool_size, - sort_type=reader.SortType.NONE, - shuffle=False, - shuffle_batch=False, - start_mark=args.special_token[0], - end_mark=args.special_token[1], - unk_mark=args.special_token[2], - max_length=args.max_length, - n_head=args.n_head) - batch_generator = processor.data_generator(phase="predict", place=place) - args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \ - args.unk_idx = processor.get_vocab_summary() - trg_idx2word = reader.DataProcessor.load_dict( - dict_path=args.trg_vocab_fpath, reverse=True) - - test_prog = fluid.default_main_program() - startup_prog = fluid.default_startup_program() - - with fluid.program_guard(test_prog, startup_prog): - with fluid.unique_name.guard(): - - # define input and reader - - input_field_names = desc.encoder_data_input_fields + desc.fast_decoder_data_input_fields - input_slots = [{ - "name": name, - "shape": desc.input_descs[name][0], - "dtype": desc.input_descs[name][1] - } for name in input_field_names] - - input_field = InputField(input_slots) - input_field.build(build_pyreader=True) - - # define the network - - predictions = create_net( - is_training=False, model_input=input_field, args=args) - out_ids, out_scores = predictions - - # This is used here to set dropout to the test mode. - test_prog = test_prog.clone(for_test=True) - - # prepare predicting - - ## define the executor and program for training - - exe = fluid.Executor(place) - - exe.run(startup_prog) - assert (args.init_from_params) or (args.init_from_pretrain_model) - - if args.init_from_params: - init_from_params(args, exe, test_prog) - - elif args.init_from_pretrain_model: - init_from_pretrain_model(args, exe, test_prog) - - # to avoid a longer length than training, reset the size of position encoding to max_length - for pos_enc_param_name in desc.pos_enc_param_names: - pos_enc_param = fluid.global_scope().find_var( - pos_enc_param_name).get_tensor() - - pos_enc_param.set( - position_encoding_init(args.max_length + 1, args.d_model), place) - - exe_strategy = fluid.ExecutionStrategy() - # to clear tensor array after each iteration - exe_strategy.num_iteration_per_drop_scope = 1 - compiled_test_prog = fluid.CompiledProgram(test_prog).with_data_parallel( - exec_strategy=exe_strategy, places=place) - - f = open(args.output_file, "wb") - # start predicting - ## decorate the pyreader with batch_generator - input_field.loader.set_batch_generator(batch_generator) - input_field.loader.start() - while True: - try: - seq_ids, seq_scores = exe.run( - compiled_test_prog, - fetch_list=[out_ids.name, out_scores.name], - return_numpy=False) - - # How to parse the results: - # Suppose the lod of seq_ids is: - # [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]] - # then from lod[0]: - # there are 2 source sentences, beam width is 3. 
- # from lod[1]: - # the first source sentence has 3 hyps; the lengths are 12, 12, 16 - # the second source sentence has 3 hyps; the lengths are 14, 13, 15 - hyps = [[] for i in range(len(seq_ids.lod()[0]) - 1)] - scores = [[] for i in range(len(seq_scores.lod()[0]) - 1)] - for i in range(len(seq_ids.lod()[0]) - - 1): # for each source sentence - start = seq_ids.lod()[0][i] - end = seq_ids.lod()[0][i + 1] - for j in range(end - start): # for each candidate - sub_start = seq_ids.lod()[1][start + j] - sub_end = seq_ids.lod()[1][start + j + 1] - hyps[i].append(b" ".join([ - trg_idx2word[idx] - for idx in post_process_seq( - np.array(seq_ids)[sub_start:sub_end], args.bos_idx, - args.eos_idx) - ])) - scores[i].append(np.array(seq_scores)[sub_end - 1]) - f.write(hyps[i][-1] + b"\n") - if len(hyps[i]) >= args.n_best: - break - except fluid.core.EOFException: - break - - f.close() - - -if __name__ == "__main__": - args = PDConfig(yaml_file="./transformer.yaml") - args.build() - args.Print() - check_gpu(args.use_cuda) - check_version() - - do_predict(args) diff --git a/PaddleNLP/neural_machine_translation/transformer/reader.py b/PaddleNLP/neural_machine_translation/transformer/reader.py deleted file mode 100644 index e69b4a252be9f7a2d684286cd6fb9009128a1c3f..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/reader.py +++ /dev/null @@ -1,567 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import glob -import six -import os -import tarfile - -import numpy as np -import paddle.fluid as fluid - - -def pad_batch_data(insts, - pad_idx, - n_head, - is_target=False, - is_label=False, - return_attn_bias=True, - return_max_len=True, - return_num_token=False): - """ - Pad the instances to the max sequence length in batch, and generate the - corresponding position data and attention bias. - """ - return_list = [] - max_len = max(len(inst) for inst in insts) - # Any token included in dict can be used to pad, since the paddings' loss - # will be masked out by weights and make no effect on parameter gradients. - inst_data = np.array( - [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]) - return_list += [inst_data.astype("int64").reshape([-1, 1])] - if is_label: # label weight - inst_weight = np.array([[1.] * len(inst) + [0.] * (max_len - len(inst)) - for inst in insts]) - return_list += [inst_weight.astype("float32").reshape([-1, 1])] - else: # position data - inst_pos = np.array([ - list(range(0, len(inst))) + [0] * (max_len - len(inst)) - for inst in insts - ]) - return_list += [inst_pos.astype("int64").reshape([-1, 1])] - if return_attn_bias: - if is_target: - # This is used to avoid attention on paddings and subsequent - # words. 
- slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, max_len)) - slf_attn_bias_data = np.triu(slf_attn_bias_data, - 1).reshape([-1, 1, max_len, max_len]) - slf_attn_bias_data = np.tile(slf_attn_bias_data, - [1, n_head, 1, 1]) * [-1e9] - else: - # This is used to avoid attention on paddings. - slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] * - (max_len - len(inst)) - for inst in insts]) - slf_attn_bias_data = np.tile( - slf_attn_bias_data.reshape([-1, 1, 1, max_len]), - [1, n_head, max_len, 1]) - return_list += [slf_attn_bias_data.astype("float32")] - if return_max_len: - return_list += [max_len] - if return_num_token: - num_token = 0 - for inst in insts: - num_token += len(inst) - return_list += [num_token] - return return_list if len(return_list) > 1 else return_list[0] - - -def prepare_train_input(insts, src_pad_idx, trg_pad_idx, n_head): - """ - Put all padded data needed by training into a list. - """ - src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data( - [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False) - src_word = src_word.reshape(-1, src_max_len) - src_pos = src_pos.reshape(-1, src_max_len) - trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = pad_batch_data( - [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True) - trg_word = trg_word.reshape(-1, trg_max_len) - trg_pos = trg_pos.reshape(-1, trg_max_len) - - trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], - [1, 1, trg_max_len, 1]).astype("float32") - - lbl_word, lbl_weight, num_token = pad_batch_data( - [inst[2] for inst in insts], - trg_pad_idx, - n_head, - is_target=False, - is_label=True, - return_attn_bias=False, - return_max_len=False, - return_num_token=True) - lbl_word = lbl_word.reshape(-1, 1) - lbl_weight = lbl_weight.reshape(-1, 1) - - data_inputs = [ - src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, - trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight - ] - - return data_inputs - - -def prepare_infer_input(insts, src_pad_idx, bos_idx, n_head, place): - """ - Put all padded data needed by beam search decoder into a list. 
- """ - src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data( - [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False) - # start tokens - trg_word = np.asarray([[bos_idx]] * len(insts), dtype="int64") - trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], - [1, 1, 1, 1]).astype("float32") - trg_word = trg_word.reshape(-1, 1) - src_word = src_word.reshape(-1, src_max_len) - src_pos = src_pos.reshape(-1, src_max_len) - - def to_lodtensor(data, place, lod=None): - data_tensor = fluid.LoDTensor() - data_tensor.set(data, place) - if lod is not None: - data_tensor.set_lod(lod) - return data_tensor - - # beamsearch_op must use tensors with lod - init_score = to_lodtensor( - np.zeros_like( - trg_word, dtype="float32").reshape(-1, 1), - place, [range(trg_word.shape[0] + 1)] * 2) - trg_word = to_lodtensor(trg_word, place, [range(trg_word.shape[0] + 1)] * 2) - init_idx = np.asarray(range(len(insts)), dtype="int32") - - data_inputs = [ - src_word, src_pos, src_slf_attn_bias, trg_word, init_score, init_idx, - trg_src_attn_bias - ] - return data_inputs - - -class SortType(object): - GLOBAL = 'global' - POOL = 'pool' - NONE = "none" - - -class Converter(object): - def __init__(self, vocab, beg, end, unk, delimiter, add_beg): - self._vocab = vocab - self._beg = beg - self._end = end - self._unk = unk - self._delimiter = delimiter - self._add_beg = add_beg - - def __call__(self, sentence): - return ([self._beg] if self._add_beg else []) + [ - self._vocab.get(w, self._unk) - for w in sentence.split(self._delimiter) - ] + [self._end] - - -class ComposedConverter(object): - def __init__(self, converters): - self._converters = converters - - def __call__(self, parallel_sentence): - return [ - self._converters[i](parallel_sentence[i]) - for i in range(len(self._converters)) - ] - - -class SentenceBatchCreator(object): - def __init__(self, batch_size): - self.batch = [] - self._batch_size = batch_size - - def append(self, info): - self.batch.append(info) - if len(self.batch) == self._batch_size: - tmp = self.batch - self.batch = [] - return tmp - - -class TokenBatchCreator(object): - def __init__(self, batch_size): - self.batch = [] - self.max_len = -1 - self._batch_size = batch_size - - def append(self, info): - cur_len = info.max_len - max_len = max(self.max_len, cur_len) - if max_len * (len(self.batch) + 1) > self._batch_size: - result = self.batch - self.batch = [info] - self.max_len = cur_len - return result - else: - self.max_len = max_len - self.batch.append(info) - - -class SampleInfo(object): - def __init__(self, i, max_len, min_len): - self.i = i - self.min_len = min_len - self.max_len = max_len - - -class MinMaxFilter(object): - def __init__(self, max_len, min_len, underlying_creator): - self._min_len = min_len - self._max_len = max_len - self._creator = underlying_creator - - def append(self, info): - if info.max_len > self._max_len or info.min_len < self._min_len: - return - else: - return self._creator.append(info) - - @property - def batch(self): - return self._creator.batch - - -class DataProcessor(object): - """ - The data reader loads all data from files and produces batches of data - in the way corresponding to settings. 
-
-    An example of returning a generator producing data batches whose data
-    is shuffled in each pass and sorted in each pool:
-
-    ```
-    train_data = DataProcessor(
-        src_vocab_fpath='data/src_vocab_file',
-        trg_vocab_fpath='data/trg_vocab_file',
-        fpattern='data/part-*',
-        use_token_batch=True,
-        batch_size=2000,
-        device_count=8,
-        n_head=8,
-        pool_size=10000,
-        sort_type=SortType.POOL,
-        shuffle=True,
-        shuffle_batch=True,
-        start_mark='<s>',
-        end_mark='<e>',
-        unk_mark='<unk>',
-        clip_last_batch=False).data_generator(phase='train')
-    ```
-
-    :param src_vocab_fpath: The path of vocabulary file of source language.
-    :type src_vocab_fpath: basestring
-    :param trg_vocab_fpath: The path of vocabulary file of target language.
-    :type trg_vocab_fpath: basestring
-    :param fpattern: The pattern to match data files.
-    :type fpattern: basestring
-    :param batch_size: The number of sequences contained in a mini-batch,
-        or the maximum number of tokens (include paddings) contained in a
-        mini-batch.
-    :type batch_size: int
-    :param pool_size: The size of pool buffer.
-    :type pool_size: int
-    :param device_count: The number of devices. The actual batch size is
-        determined by both batch_size and device_count.
-    :type device_count: int
-    :param n_head: The number of head used in multi-head attention. Actually,
-        this is not a reader related argument, but is used for input data.
-    :type n_head: int
-    :param sort_type: The grain to sort by length: 'global' for all
-        instances; 'pool' for instances in pool; 'none' for no sort.
-    :type sort_type: basestring
-    :param clip_last_batch: Whether to clip the last uncompleted batch.
-    :type clip_last_batch: bool
-    :param tar_fname: The data file in tar if fpattern matches a tar file.
-    :type tar_fname: basestring
-    :param min_length: The minimum length used to filter sequences.
-    :type min_length: int
-    :param max_length: The maximum length used to filter sequences.
-    :type max_length: int
-    :param shuffle: Whether to shuffle all instances.
-    :type shuffle: bool
-    :param shuffle_batch: Whether to shuffle the generated batches.
-    :type shuffle_batch: bool
-    :param use_token_batch: Whether to produce batch data according to
-        token number.
-    :type use_token_batch: bool
-    :param field_delimiter: The delimiter used to split source and target in
-        each line of data file.
-    :type field_delimiter: basestring
-    :param token_delimiter: The delimiter used to split tokens in source or
-        target sentences.
-    :type token_delimiter: basestring
-    :param start_mark: The token representing the beginning of
-        sentences in dictionary.
-    :type start_mark: basestring
-    :param end_mark: The token representing the end of sentences
-        in dictionary.
-    :type end_mark: basestring
-    :param unk_mark: The token representing unknown word in dictionary.
-    :type unk_mark: basestring
-    :param only_src: Whether each line is a source and target sentence
-        pair or only has the source sentence.
-    :type only_src: bool
-    :param seed: The seed for random.
- :type seed: int - """ - - def __init__(self, - src_vocab_fpath, - trg_vocab_fpath, - fpattern, - batch_size, - device_count, - n_head, - pool_size, - sort_type=SortType.GLOBAL, - clip_last_batch=False, - tar_fname=None, - min_length=0, - max_length=100, - shuffle=True, - shuffle_batch=False, - use_token_batch=False, - field_delimiter="\t", - token_delimiter=" ", - start_mark="", - end_mark="", - unk_mark="", - only_src=False, - seed=0): - # convert str to bytes, and use byte data - field_delimiter = field_delimiter.encode("utf8") - token_delimiter = token_delimiter.encode("utf8") - start_mark = start_mark.encode("utf8") - end_mark = end_mark.encode("utf8") - unk_mark = unk_mark.encode("utf8") - self._src_vocab = self.load_dict(src_vocab_fpath) - self._trg_vocab = self.load_dict(trg_vocab_fpath) - self._bos_idx = self._src_vocab[start_mark] - self._eos_idx = self._src_vocab[end_mark] - self._unk_idx = self._src_vocab[unk_mark] - self._only_src = only_src - self._pool_size = pool_size - self._batch_size = batch_size - self._device_count = device_count - self._n_head = n_head - self._use_token_batch = use_token_batch - self._sort_type = sort_type - self._clip_last_batch = clip_last_batch - self._shuffle = shuffle - self._shuffle_batch = shuffle_batch - self._min_length = min_length - self._max_length = max_length - self._field_delimiter = field_delimiter - self._token_delimiter = token_delimiter - self.load_src_trg_ids(fpattern, tar_fname) - self._random = np.random - self._random.seed(seed) - - def load_src_trg_ids(self, fpattern, tar_fname): - converters = [ - Converter( - vocab=self._src_vocab, - beg=self._bos_idx, - end=self._eos_idx, - unk=self._unk_idx, - delimiter=self._token_delimiter, - add_beg=False) - ] - if not self._only_src: - converters.append( - Converter( - vocab=self._trg_vocab, - beg=self._bos_idx, - end=self._eos_idx, - unk=self._unk_idx, - delimiter=self._token_delimiter, - add_beg=True)) - - converters = ComposedConverter(converters) - - self._src_seq_ids = [] - self._trg_seq_ids = None if self._only_src else [] - self._sample_infos = [] - - for i, line in enumerate(self._load_lines(fpattern, tar_fname)): - src_trg_ids = converters(line) - self._src_seq_ids.append(src_trg_ids[0]) - lens = [len(src_trg_ids[0])] - if not self._only_src: - self._trg_seq_ids.append(src_trg_ids[1]) - lens.append(len(src_trg_ids[1])) - self._sample_infos.append(SampleInfo(i, max(lens), min(lens))) - - def _load_lines(self, fpattern, tar_fname): - fpaths = glob.glob(fpattern) - assert len(fpaths) > 0, "no matching file to the provided data path" - - if len(fpaths) == 1 and tarfile.is_tarfile(fpaths[0]): - if tar_fname is None: - raise Exception("If tar file provided, please set tar_fname.") - - f = tarfile.open(fpaths[0], "rb") - for line in f.extractfile(tar_fname): - fields = line.strip(b"\n").split(self._field_delimiter) - if (not self._only_src and len(fields) == 2) or ( - self._only_src and len(fields) == 1): - yield fields - else: - for fpath in fpaths: - if not os.path.isfile(fpath): - raise IOError("Invalid file: %s" % fpath) - - with open(fpath, "rb") as f: - for line in f: - fields = line.strip(b"\n").split(self._field_delimiter) - if (not self._only_src and len(fields) == 2) or ( - self._only_src and len(fields) == 1): - yield fields - - @staticmethod - def load_dict(dict_path, reverse=False): - word_dict = {} - with open(dict_path, "rb") as fdict: - for idx, line in enumerate(fdict): - if reverse: - word_dict[idx] = line.strip(b"\n") - else: - word_dict[line.strip(b"\n")] = idx - 
return word_dict - - def batch_generator(self, batch_size, use_token_batch): - def __impl__(): - # global sort or global shuffle - if self._sort_type == SortType.GLOBAL: - infos = sorted(self._sample_infos, key=lambda x: x.max_len) - else: - if self._shuffle: - infos = self._sample_infos - self._random.shuffle(infos) - else: - infos = self._sample_infos - - if self._sort_type == SortType.POOL: - reverse = True - for i in range(0, len(infos), self._pool_size): - # to avoid placing short next to long sentences - reverse = not reverse - infos[i:i + self._pool_size] = sorted( - infos[i:i + self._pool_size], - key=lambda x: x.max_len, - reverse=reverse) - - # concat batch - batches = [] - batch_creator = TokenBatchCreator( - batch_size) if use_token_batch else SentenceBatchCreator( - batch_size) - batch_creator = MinMaxFilter(self._max_length, self._min_length, - batch_creator) - - for info in infos: - batch = batch_creator.append(info) - if batch is not None: - batches.append(batch) - - if not self._clip_last_batch and len(batch_creator.batch) != 0: - batches.append(batch_creator.batch) - - if self._shuffle_batch: - self._random.shuffle(batches) - - for batch in batches: - batch_ids = [info.i for info in batch] - - if self._only_src: - yield [[self._src_seq_ids[idx]] for idx in batch_ids] - else: - yield [(self._src_seq_ids[idx], self._trg_seq_ids[idx][:-1], - self._trg_seq_ids[idx][1:]) for idx in batch_ids] - - return __impl__ - - @staticmethod - def stack(data_reader, count, clip_last=True): - def __impl__(): - res = [] - for item in data_reader(): - res.append(item) - if len(res) == count: - yield res - res = [] - if len(res) == count: - yield res - elif not clip_last: - data = [] - for item in res: - data += item - if len(data) > count: - inst_num_per_part = len(data) // count - yield [ - data[inst_num_per_part * i:inst_num_per_part * (i + 1)] - for i in range(count) - ] - - return __impl__ - - @staticmethod - def split(data_reader, count): - def __impl__(): - for item in data_reader(): - inst_num_per_part = len(item) // count - for i in range(count): - yield item[inst_num_per_part * i:inst_num_per_part * (i + 1 - )] - - return __impl__ - - def data_generator(self, phase, place=None): - # Any token included in dict can be used to pad, since the paddings' loss - # will be masked out by weights and make no effect on parameter gradients. 
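The comment above is worth unpacking: because every per-token loss is multiplied by a 0/1 weight before the reduction, the id chosen for padding is irrelevant. A toy numpy illustration (numbers made up) of the weighting that the model code applies further down:

```python
import numpy as np

# Per-token losses for two sequences padded to length 4; weights mark real
# tokens with 1 and padding with 0, so padded positions never reach the sum.
cost = np.array([[2.1, 1.3, 0.7, 0.9],
                 [1.8, 0.4, 0.5, 0.2]])
weights = np.array([[1., 1., 1., 0.],
                    [1., 1., 0., 0.]])
sum_cost = (cost * weights).sum()
avg_cost = sum_cost / weights.sum()  # normalize by the real-token count
print(sum_cost, avg_cost)
```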
- src_pad_idx = trg_pad_idx = self._eos_idx - bos_idx = self._bos_idx - n_head = self._n_head - data_reader = self.batch_generator( - self._batch_size * - (1 if self._use_token_batch else self._device_count), - self._use_token_batch) - if not self._use_token_batch: - # to make data on each device have similar token number - data_reader = self.split(data_reader, self._device_count) - - def __for_train__(): - for data in data_reader(): - data_inputs = prepare_train_input(data, src_pad_idx, - trg_pad_idx, n_head) - yield data_inputs - - def __for_predict__(): - for data in data_reader(): - data_inputs = prepare_infer_input(data, src_pad_idx, bos_idx, - n_head, place) - yield data_inputs - - return __for_train__ if phase == "train" else __for_predict__ - - def get_vocab_summary(self): - return len(self._src_vocab), len( - self._trg_vocab), self._bos_idx, self._eos_idx, self._unk_idx diff --git a/PaddleNLP/neural_machine_translation/transformer/train.py b/PaddleNLP/neural_machine_translation/transformer/train.py deleted file mode 100644 index 48b4847f68e849b109133d2d413b03e456e9825c..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/train.py +++ /dev/null @@ -1,327 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
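The `do_train` below scales `fluid.layers.learning_rate_scheduler.noam_decay(d_model, warmup_steps)` by a static `learning_rate`. Assuming the standard Noam formula from Vaswani et al. (2017), the resulting schedule can be sketched as:

```python
# Sketch of the schedule, assuming the standard Noam formula:
# lr(step) = base_lr * d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)
# Defaults mirror transformer.yaml (d_model=512, warmup_steps=8000, lr=2.0).
def noam_lr(step, d_model=512, warmup=8000, base_lr=2.0):
    return base_lr * d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

for step in (1, 4000, 8000, 16000, 100000):
    print(step, noam_lr(step))  # rises linearly during warmup, then decays
```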
- -import logging -import os -import six -import sys -import time - -import numpy as np -import paddle -import paddle.fluid as fluid - -import utils.dist_utils as dist_utils -from utils.input_field import InputField -from utils.configure import PDConfig -from utils.check import check_gpu, check_version - -# include task-specific libs -import desc -import reader -from transformer import create_net, position_encoding_init - -if os.environ.get('FLAGS_eager_delete_tensor_gb', None) is None: - os.environ['FLAGS_eager_delete_tensor_gb'] = '0' -# num_trainers is used for multi-process gpu training -num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) - - -def init_from_pretrain_model(args, exe, program): - - assert isinstance(args.init_from_pretrain_model, str) - - if not os.path.exists(args.init_from_pretrain_model): - raise Warning("The pretrained params do not exist.") - return False - - def existed_params(var): - if not isinstance(var, fluid.framework.Parameter): - return False - return os.path.exists( - os.path.join(args.init_from_pretrain_model, var.name)) - - fluid.io.load_vars( - exe, - args.init_from_pretrain_model, - main_program=program, - predicate=existed_params) - - print("finish initing model from pretrained params from %s" % - (args.init_from_pretrain_model)) - - return True - - -def init_from_checkpoint(args, exe, program): - - assert isinstance(args.init_from_checkpoint, str) - - if not os.path.exists(args.init_from_checkpoint): - raise Warning("the checkpoint path does not exist.") - return False - - fluid.io.load_persistables( - executor=exe, - dirname=args.init_from_checkpoint, - main_program=program, - filename="checkpoint.pdckpt") - - print("finish initing model from checkpoint from %s" % - (args.init_from_checkpoint)) - - return True - - -def save_checkpoint(args, exe, program, dirname): - - assert isinstance(args.save_model_path, str) - - checkpoint_dir = os.path.join(args.save_model_path, args.save_checkpoint) - - if not os.path.exists(checkpoint_dir): - os.mkdir(checkpoint_dir) - - fluid.io.save_persistables( - exe, - os.path.join(checkpoint_dir, dirname), - main_program=program, - filename="checkpoint.pdparams") - - print("save checkpoint at %s" % (os.path.join(checkpoint_dir, dirname))) - - return True - - -def save_param(args, exe, program, dirname): - - assert isinstance(args.save_model_path, str) - - param_dir = os.path.join(args.save_model_path, args.save_param) - - if not os.path.exists(param_dir): - os.mkdir(param_dir) - - fluid.io.save_params( - exe, - os.path.join(param_dir, dirname), - main_program=program, - filename="params.pdparams") - print("save parameters at %s" % (os.path.join(param_dir, dirname))) - - return True - - -def do_train(args): - if args.use_cuda: - if num_trainers > 1: # for multi-process gpu training - dev_count = 1 - else: - dev_count = fluid.core.get_cuda_device_count() - gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) - place = fluid.CUDAPlace(gpu_id) - else: - dev_count = int(os.environ.get('CPU_NUM', 1)) - place = fluid.CPUPlace() - - # define the data generator - processor = reader.DataProcessor( - fpattern=args.training_file, - src_vocab_fpath=args.src_vocab_fpath, - trg_vocab_fpath=args.trg_vocab_fpath, - token_delimiter=args.token_delimiter, - use_token_batch=args.use_token_batch, - batch_size=args.batch_size, - device_count=dev_count, - pool_size=args.pool_size, - sort_type=args.sort_type, - shuffle=args.shuffle, - shuffle_batch=args.shuffle_batch, - start_mark=args.special_token[0], - 
end_mark=args.special_token[1], - unk_mark=args.special_token[2], - max_length=args.max_length, - n_head=args.n_head) - batch_generator = processor.data_generator(phase="train") - if num_trainers > 1: # for multi-process gpu training - batch_generator = fluid.contrib.reader.distributed_batch_reader( - batch_generator) - args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \ - args.unk_idx = processor.get_vocab_summary() - - train_prog = fluid.default_main_program() - startup_prog = fluid.default_startup_program() - random_seed = eval(str(args.random_seed)) - if random_seed is not None: - train_prog.random_seed = random_seed - startup_prog.random_seed = random_seed - - with fluid.program_guard(train_prog, startup_prog): - with fluid.unique_name.guard(): - - # define input and reader - - input_field_names = desc.encoder_data_input_fields + \ - desc.decoder_data_input_fields[:-1] + desc.label_data_input_fields - input_slots = [{ - "name": name, - "shape": desc.input_descs[name][0], - "dtype": desc.input_descs[name][1] - } for name in input_field_names] - - input_field = InputField(input_slots) - input_field.build(build_pyreader=True) - - # define the network - - sum_cost, avg_cost, token_num = create_net( - is_training=True, model_input=input_field, args=args) - - # define the optimizer - - with fluid.default_main_program()._lr_schedule_guard(): - learning_rate = fluid.layers.learning_rate_scheduler.noam_decay( - args.d_model, args.warmup_steps) * args.learning_rate - - optimizer = fluid.optimizer.Adam( - learning_rate=learning_rate, - beta1=args.beta1, - beta2=args.beta2, - epsilon=float(args.eps)) - optimizer.minimize(avg_cost) - - # prepare training - - ## decorate the pyreader with batch_generator - input_field.loader.set_batch_generator(batch_generator) - - ## define the executor and program for training - - exe = fluid.Executor(place) - - exe.run(startup_prog) - # init position_encoding - for pos_enc_param_name in desc.pos_enc_param_names: - pos_enc_param = fluid.global_scope().find_var( - pos_enc_param_name).get_tensor() - - pos_enc_param.set( - position_encoding_init(args.max_length + 1, args.d_model), place) - - assert (args.init_from_checkpoint == "") or ( - args.init_from_pretrain_model == "") - - ## init from some checkpoint, to resume the previous training - if args.init_from_checkpoint: - init_from_checkpoint(args, exe, train_prog) - - ## init from some pretrain models, to better solve the current task - if args.init_from_pretrain_model: - init_from_pretrain_model(args, exe, train_prog) - - build_strategy = fluid.compiler.BuildStrategy() - build_strategy.enable_inplace = True - exec_strategy = fluid.ExecutionStrategy() - if num_trainers > 1: - dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog) - exec_strategy.num_threads = 1 - - compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel( - loss_name=avg_cost.name, - build_strategy=build_strategy, - exec_strategy=exec_strategy) - - # the best cross-entropy value with label smoothing - loss_normalizer = -( - (1. - args.label_smooth_eps) * np.log( - (1. 
- args.label_smooth_eps)) + args.label_smooth_eps * - np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20)) - # start training - - step_idx = 0 - for pass_id in range(args.epoch): - pass_start_time = time.time() - input_field.loader.start() - - batch_id = 0 - while True: - try: - outs = exe.run(compiled_train_prog, - fetch_list=[sum_cost.name, token_num.name] - if step_idx % args.print_step == 0 else []) - - if step_idx % args.print_step == 0: - sum_cost_val, token_num_val = np.array(outs[0]), np.array( - outs[1]) - # sum the cost from multi-devices - total_sum_cost = sum_cost_val.sum() - total_token_num = token_num_val.sum() - total_avg_cost = total_sum_cost / total_token_num - - if step_idx == 0: - logging.info( - "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, " - "normalized loss: %f, ppl: %f" % - (step_idx, pass_id, batch_id, total_avg_cost, - total_avg_cost - loss_normalizer, - np.exp([min(total_avg_cost, 100)]))) - avg_batch_time = time.time() - else: - logging.info( - "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, " - "normalized loss: %f, ppl: %f, speed: %.2f step/s" % - (step_idx, pass_id, batch_id, total_avg_cost, - total_avg_cost - loss_normalizer, - np.exp([min(total_avg_cost, 100)]), - args.print_step / (time.time() - avg_batch_time))) - avg_batch_time = time.time() - - if step_idx % args.save_step == 0 and step_idx != 0: - - if args.save_checkpoint: - save_checkpoint(args, exe, train_prog, - "step_" + str(step_idx)) - - if args.save_param: - save_param(args, exe, train_prog, - "step_" + str(step_idx)) - - batch_id += 1 - step_idx += 1 - - except fluid.core.EOFException: - input_field.loader.reset() - break - - time_consumed = time.time() - pass_start_time - - if args.save_checkpoint: - save_checkpoint(args, exe, train_prog, "step_final") - - if args.save_param: - save_param(args, exe, train_prog, "step_final") - - if args.enable_ce: # For CE - print("kpis\ttrain_cost_card%d\t%f" % (dev_count, total_avg_cost)) - print("kpis\ttrain_duration_card%d\t%f" % (dev_count, time_consumed)) - - -if __name__ == "__main__": - args = PDConfig(yaml_file="./transformer.yaml") - args.build() - args.Print() - check_gpu(args.use_cuda) - check_version() - - do_train(args) diff --git a/PaddleNLP/neural_machine_translation/transformer/transformer.py b/PaddleNLP/neural_machine_translation/transformer/transformer.py deleted file mode 100644 index be20001b25fdb94fcc4bc234bae220413ddfacdd..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/transformer.py +++ /dev/null @@ -1,873 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
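`position_encoding_init` below fills the frozen position-embedding tables. An independent numpy sketch of the sinusoid table in the paper's formulation; note the deleted code spaces its timescales over `num_timescales - 1` rather than `d_model / 2`, a slight variation:

```python
import numpy as np

# Sinusoid position table, Vaswani et al. style: sin in the first d/2
# channels, cos in the rest, one row per position.
def sinusoid_table(n_position, d_model):
    pos = np.arange(n_position)[:, None]            # [n_position, 1]
    i = np.arange(d_model // 2)[None, :]            # [1, d_model // 2]
    angle = pos / np.power(1e4, 2.0 * i / d_model)  # broadcast to a grid
    return np.concatenate([np.sin(angle), np.cos(angle)],
                          axis=1).astype("float32")

print(sinusoid_table(6, 8).shape)  # (6, 8)
```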
-
-from functools import partial
-import numpy as np
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-
-from desc import *
-
-# Set seed for CE or debug
-dropout_seed = None
-
-
-def wrap_layer_with_block(layer, block_idx):
-    """
-    Make a layer definition support specifying the block it runs in, so that
-    layers can be added to blocks other than the current one. This makes it
-    easy to define caches shared across while-loop steps.
-    """
-
-    class BlockGuard(object):
-        """
-        BlockGuard class.
-
-        BlockGuard class is used to switch to the given block in a program by
-        using the Python `with` keyword.
-        """
-
-        def __init__(self, block_idx=None, main_program=None):
-            self.main_program = fluid.default_main_program(
-            ) if main_program is None else main_program
-            self.old_block_idx = self.main_program.current_block().idx
-            self.new_block_idx = block_idx
-
-        def __enter__(self):
-            self.main_program.current_block_idx = self.new_block_idx
-
-        def __exit__(self, exc_type, exc_val, exc_tb):
-            self.main_program.current_block_idx = self.old_block_idx
-            if exc_type is not None:
-                return False  # re-raise exception
-            return True
-
-    def layer_wrapper(*args, **kwargs):
-        with BlockGuard(block_idx):
-            return layer(*args, **kwargs)
-
-    return layer_wrapper
-
-
-def position_encoding_init(n_position, d_pos_vec):
-    """
-    Generate the initial values for the sinusoid position encoding table.
-    """
-    channels = d_pos_vec
-    position = np.arange(n_position)
-    num_timescales = channels // 2
-    log_timescale_increment = (np.log(float(1e4) / float(1)) /
-                               (num_timescales - 1))
-    inv_timescales = np.exp(np.arange(
-        num_timescales)) * -log_timescale_increment
-    scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
-                                                               0)
-    signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
-    signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
-    position_enc = signal
-    return position_enc.astype("float32")
-
-
-def multi_head_attention(queries,
-                         keys,
-                         values,
-                         attn_bias,
-                         d_key,
-                         d_value,
-                         d_model,
-                         n_head=1,
-                         dropout_rate=0.,
-                         cache=None,
-                         gather_idx=None,
-                         static_kv=False):
-    """
-    Multi-Head Attention. Note that attn_bias is added to the logit before
-    computing the softmax activation to mask certain selected positions so
-    that they will not be considered in attention weights.
-    """
-    keys = queries if keys is None else keys
-    values = keys if values is None else values
-
-    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
-        raise ValueError(
-            "Inputs: queries, keys and values should all be 3-D tensors.")
-
-    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
-        """
-        Add linear projection to queries, keys, and values.
-        """
-        q = layers.fc(input=queries,
-                      size=d_key * n_head,
-                      bias_attr=False,
-                      num_flatten_dims=2)
-        # For encoder-decoder attention in inference, insert the ops and vars
-        # into the global block to use as cache across beam search steps.
-        fc_layer = wrap_layer_with_block(
-            layers.fc, fluid.default_main_program().current_block(
-            ).parent_idx) if cache is not None and static_kv else layers.fc
-        k = fc_layer(
-            input=keys,
-            size=d_key * n_head,
-            bias_attr=False,
-            num_flatten_dims=2)
-        v = fc_layer(
-            input=values,
-            size=d_value * n_head,
-            bias_attr=False,
-            num_flatten_dims=2)
-        return q, k, v
-
-    def __split_heads_qkv(queries, keys, values, n_head, d_key, d_value):
-        """
-        Reshape input tensors at the last dimension to split multi-heads
-        and then transpose. Specifically, transform the input tensor with shape
-        [bs, max_sequence_length, n_head * hidden_dim] to the output tensor
-        with shape [bs, n_head, max_sequence_length, hidden_dim].
-        """
-        # The value 0 in shape attr means copying the corresponding dimension
-        # size of the input as the output dimension size.
-        reshaped_q = layers.reshape(
-            x=queries, shape=[0, 0, n_head, d_key], inplace=True)
-        # permute the dimensions into:
-        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
-        q = layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
-        # For encoder-decoder attention in inference, insert the ops and vars
-        # into the global block to use as cache across beam search steps.
-        reshape_layer = wrap_layer_with_block(
-            layers.reshape,
-            fluid.default_main_program().current_block(
-            ).parent_idx) if cache is not None and static_kv else layers.reshape
-        transpose_layer = wrap_layer_with_block(
-            layers.transpose,
-            fluid.default_main_program().current_block().
-            parent_idx) if cache is not None and static_kv else layers.transpose
-        reshaped_k = reshape_layer(
-            x=keys, shape=[0, 0, n_head, d_key], inplace=True)
-        k = transpose_layer(x=reshaped_k, perm=[0, 2, 1, 3])
-        reshaped_v = reshape_layer(
-            x=values, shape=[0, 0, n_head, d_value], inplace=True)
-        v = transpose_layer(x=reshaped_v, perm=[0, 2, 1, 3])
-
-        if cache is not None:  # only for faster inference
-            if static_kv:  # For encoder-decoder attention in inference
-                cache_k, cache_v = cache["static_k"], cache["static_v"]
-                # To init the static_k and static_v in cache.
-                # Maybe we can use condition_op(if_else) to do these at the
-                # first step in the while loop instead; however, it might be
-                # less efficient.
-                static_cache_init = wrap_layer_with_block(
-                    layers.assign,
-                    fluid.default_main_program().current_block().parent_idx)
-                static_cache_init(k, cache_k)
-                static_cache_init(v, cache_v)
-            else:  # For decoder self-attention in inference
-                cache_k, cache_v = cache["k"], cache["v"]
-            # gather cell states corresponding to the selected parent
-            select_k = layers.gather(cache_k, index=gather_idx)
-            select_v = layers.gather(cache_v, index=gather_idx)
-            if not static_kv:
-                # For self-attention in inference, use the cache and concat
-                # time steps.
-                select_k = layers.concat([select_k, k], axis=2)
-                select_v = layers.concat([select_v, v], axis=2)
-            # update cell states (caches) cached in the global block
-            layers.assign(select_k, cache_k)
-            layers.assign(select_v, cache_v)
-            return q, select_k, select_v
-        return q, k, v
-
-    def __combine_heads(x):
-        """
-        Transpose and then reshape the last two dimensions of input tensor x
-        so that it becomes one dimension, which is the reverse of
-        __split_heads.
-        """
-        if len(x.shape) != 4:
-            raise ValueError("Input(x) should be a 4-D Tensor.")
-
-        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
-        # The value 0 in shape attr means copying the corresponding dimension
-        # size of the input as the output dimension size.
- return layers.reshape( - x=trans_x, - shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], - inplace=True) - - def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): - """ - Scaled Dot-Product Attention - """ - product = layers.matmul(x=q, y=k, transpose_y=True, alpha=d_key**-0.5) - if attn_bias: - product += attn_bias - weights = layers.softmax(product) - if dropout_rate: - weights = layers.dropout( - weights, - dropout_prob=dropout_rate, - seed=dropout_seed, - is_test=False) - out = layers.matmul(weights, v) - return out - - q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) - q, k, v = __split_heads_qkv(q, k, v, n_head, d_key, d_value) - - ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model, - dropout_rate) - - out = __combine_heads(ctx_multiheads) - - # Project back to the model size. - proj_out = layers.fc(input=out, - size=d_model, - bias_attr=False, - num_flatten_dims=2) - return proj_out - - -def positionwise_feed_forward(x, d_inner_hid, d_hid, dropout_rate): - """ - Position-wise Feed-Forward Networks. - This module consists of two linear transformations with a ReLU activation - in between, which is applied to each position separately and identically. - """ - hidden = layers.fc(input=x, - size=d_inner_hid, - num_flatten_dims=2, - act="relu") - if dropout_rate: - hidden = layers.dropout( - hidden, dropout_prob=dropout_rate, seed=dropout_seed, is_test=False) - out = layers.fc(input=hidden, size=d_hid, num_flatten_dims=2) - return out - - -def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.): - """ - Add residual connection, layer normalization and droput to the out tensor - optionally according to the value of process_cmd. - This will be used before or after multi-head attention and position-wise - feed-forward networks. - """ - for cmd in process_cmd: - if cmd == "a": # add residual connection - out = out + prev_out if prev_out else out - elif cmd == "n": # add layer normalization - out = layers.layer_norm( - out, - begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.initializer.Constant(1.), - bias_attr=fluid.initializer.Constant(0.)) - elif cmd == "d": # add dropout - if dropout_rate: - out = layers.dropout( - out, - dropout_prob=dropout_rate, - seed=dropout_seed, - is_test=False) - return out - - -pre_process_layer = partial(pre_post_process_layer, None) -post_process_layer = pre_post_process_layer - - -def prepare_encoder_decoder(src_word, - src_pos, - src_vocab_size, - src_emb_dim, - src_max_len, - dropout_rate=0., - bos_idx=0, - word_emb_param_name=None, - pos_enc_param_name=None): - """Add word embeddings and position encodings. - The output tensor has a shape of: - [batch_size, max_src_length_in_batch, d_model]. - This module is used at the bottom of the encoder stacks. 
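For reference, the core of the `scaled_dot_product_attention` defined above reduces to a few numpy lines. A self-contained sketch with made-up shapes; masked slots would carry a large negative bias (this codebase uses -1e9) so their softmax weight vanishes:

```python
import numpy as np

# softmax(q @ k^T * d_key**-0.5 + bias) @ v, shapes [batch, n_head, len, d].
def attention(q, k, v, bias, d_key):
    logits = q @ k.transpose(0, 1, 3, 2) * d_key ** -0.5 + bias
    weights = np.exp(logits - logits.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)  # numerically stable softmax
    return weights @ v

q = k = v = np.random.rand(2, 8, 5, 64)  # batch=2, heads=8, len=5, d_key=64
bias = np.zeros((2, 8, 5, 5))            # put -1e9 at positions to mask
print(attention(q, k, v, bias, d_key=64).shape)  # (2, 8, 5, 64)
```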
- """ - src_word_emb = fluid.embedding( - src_word, - size=[src_vocab_size, src_emb_dim], - padding_idx=bos_idx, # set embedding of bos to 0 - param_attr=fluid.ParamAttr(name=word_emb_param_name, - initializer=fluid.initializer.Normal( - 0., src_emb_dim**-0.5))) - - src_word_emb = layers.scale(x=src_word_emb, scale=src_emb_dim**0.5) - src_pos_enc = fluid.embedding(src_pos, - size=[src_max_len, src_emb_dim], - param_attr=fluid.ParamAttr( - name=pos_enc_param_name, trainable=False)) - src_pos_enc.stop_gradient = True - enc_input = src_word_emb + src_pos_enc - return layers.dropout( - enc_input, dropout_prob=dropout_rate, seed=dropout_seed, - is_test=False) if dropout_rate else enc_input - - -prepare_encoder = partial( - prepare_encoder_decoder, pos_enc_param_name=pos_enc_param_names[0]) -prepare_decoder = partial( - prepare_encoder_decoder, pos_enc_param_name=pos_enc_param_names[1]) - - -def encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd="n", - postprocess_cmd="da"): - """The encoder layers that can be stacked to form a deep encoder. - This module consits of a multi-head (self) attention followed by - position-wise feed-forward networks and both the two components companied - with the post_process_layer to add residual connection, layer normalization - and droput. - """ - attn_output = multi_head_attention( - pre_process_layer(enc_input, preprocess_cmd, - prepostprocess_dropout), None, None, attn_bias, d_key, - d_value, d_model, n_head, attention_dropout) - attn_output = post_process_layer(enc_input, attn_output, postprocess_cmd, - prepostprocess_dropout) - ffd_output = positionwise_feed_forward( - pre_process_layer(attn_output, preprocess_cmd, prepostprocess_dropout), - d_inner_hid, d_model, relu_dropout) - return post_process_layer(attn_output, ffd_output, postprocess_cmd, - prepostprocess_dropout) - - -def encoder(enc_input, - attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd="n", - postprocess_cmd="da"): - """ - The encoder is composed of a stack of identical layers returned by calling - encoder_layer. - """ - for i in range(n_layer): - enc_output = encoder_layer( - enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, ) - enc_input = enc_output - enc_output = pre_process_layer(enc_output, preprocess_cmd, - prepostprocess_dropout) - return enc_output - - -def decoder_layer(dec_input, - enc_output, - slf_attn_bias, - dec_enc_attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - cache=None, - gather_idx=None): - """ The layer to be stacked in decoder part. - The structure of this module is similar to that in the encoder part except - a multi-head attention is added to implement encoder-decoder attention. 
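The `preprocess_cmd`/`postprocess_cmd` strings threaded through these layers are tiny programs over `pre_post_process_layer`: "n" normalizes, "d" applies dropout, "a" adds the residual. A string-based sketch of how "n" and "da" compose (the names are symbolic stand-ins, not the real ops):

```python
# Symbolic stand-in for pre_post_process_layer: each command letter rewrites
# the expression, so "da" means dropout first, then add the residual.
def process(prev_out, out, cmd):
    for c in cmd:
        if c == "a":
            out = "({} + {})".format(out, prev_out) if prev_out else out
        elif c == "n":
            out = "norm({})".format(out)
        elif c == "d":
            out = "drop({})".format(out)
    return out

print(process(None, "x", "n"))      # pre-process:  norm(x)
print(process("x", "attn", "da"))   # post-process: (drop(attn) + x)
```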
- """ - slf_attn_output = multi_head_attention( - pre_process_layer(dec_input, preprocess_cmd, prepostprocess_dropout), - None, - None, - slf_attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - cache=cache, - gather_idx=gather_idx) - slf_attn_output = post_process_layer( - dec_input, - slf_attn_output, - postprocess_cmd, - prepostprocess_dropout, ) - enc_attn_output = multi_head_attention( - pre_process_layer(slf_attn_output, preprocess_cmd, - prepostprocess_dropout), - enc_output, - enc_output, - dec_enc_attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - cache=cache, - gather_idx=gather_idx, - static_kv=True) - enc_attn_output = post_process_layer( - slf_attn_output, - enc_attn_output, - postprocess_cmd, - prepostprocess_dropout, ) - ffd_output = positionwise_feed_forward( - pre_process_layer(enc_attn_output, preprocess_cmd, - prepostprocess_dropout), - d_inner_hid, - d_model, - relu_dropout, ) - dec_output = post_process_layer( - enc_attn_output, - ffd_output, - postprocess_cmd, - prepostprocess_dropout, ) - return dec_output - - -def decoder(dec_input, - enc_output, - dec_slf_attn_bias, - dec_enc_attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - caches=None, - gather_idx=None): - """ - The decoder is composed of a stack of identical decoder_layer layers. - """ - for i in range(n_layer): - dec_output = decoder_layer( - dec_input, - enc_output, - dec_slf_attn_bias, - dec_enc_attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - cache=None if caches is None else caches[i], - gather_idx=gather_idx) - dec_input = dec_output - dec_output = pre_process_layer(dec_output, preprocess_cmd, - prepostprocess_dropout) - return dec_output - - -def transformer(model_input, - src_vocab_size, - trg_vocab_size, - max_length, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - label_smooth_eps, - bos_idx=0, - is_test=False): - if weight_sharing: - assert src_vocab_size == trg_vocab_size, ( - "Vocabularies in source and target should be same for weight sharing." - ) - - enc_inputs = (model_input.src_word, model_input.src_pos, - model_input.src_slf_attn_bias) - dec_inputs = (model_input.trg_word, model_input.trg_pos, - model_input.trg_slf_attn_bias, model_input.trg_src_attn_bias) - label = model_input.lbl_word - weights = model_input.lbl_weight - - enc_output = wrap_encoder(enc_inputs, - src_vocab_size, - max_length, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - bos_idx=bos_idx) - - predict = wrap_decoder(dec_inputs, - trg_vocab_size, - max_length, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - enc_output=enc_output) - - # Padding index do not contribute to the total loss. The weights is used to - # cancel padding index in calculating the loss. - if label_smooth_eps: - # TODO: use fluid.input.one_hot after softmax_with_cross_entropy removing - # the enforcement that the last dimension of label must be 1. 
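Before the `label_smooth` call that follows, it may help to see the target distribution it produces. A numpy sketch of the smoothing that the training code's loss normalizer assumes: the true token keeps `1 - eps` and the remaining mass is spread uniformly over the other classes (tiny vocabulary and targets made up for display):

```python
import numpy as np

eps, V = 0.1, 5                        # label_smooth_eps and a toy vocab size
one_hot = np.eye(V)[np.array([2, 0])]  # two target token ids
smoothed = one_hot * (1 - eps) + (1 - one_hot) * eps / (V - 1)
print(smoothed.round(3))               # each row still sums to 1
```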
- label = layers.label_smooth(label=layers.one_hot(input=label, - depth=trg_vocab_size), - epsilon=label_smooth_eps) - - cost = layers.softmax_with_cross_entropy( - logits=predict, - label=label, - soft_label=True if label_smooth_eps else False) - weighted_cost = layers.elementwise_mul(x=cost, y=weights, axis=0) - sum_cost = layers.reduce_sum(weighted_cost) - token_num = layers.reduce_sum(weights) - token_num.stop_gradient = True - avg_cost = sum_cost / token_num - return sum_cost, avg_cost, predict, token_num - - -def wrap_encoder(enc_inputs, - src_vocab_size, - max_length, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - bos_idx=0): - """ - The wrapper assembles together all needed layers for the encoder. - """ - src_word, src_pos, src_slf_attn_bias = enc_inputs - enc_input = prepare_encoder( - src_word, - src_pos, - src_vocab_size, - d_model, - max_length, - prepostprocess_dropout, - bos_idx=bos_idx, - word_emb_param_name=word_emb_param_names[0]) - enc_output = encoder( - enc_input, - src_slf_attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, ) - return enc_output - - -def wrap_decoder(dec_inputs, - trg_vocab_size, - max_length, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - enc_output=None, - caches=None, - gather_idx=None, - bos_idx=0): - """ - The wrapper assembles together all needed layers for the decoder. - """ - trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs - - dec_input = prepare_decoder( - trg_word, - trg_pos, - trg_vocab_size, - d_model, - max_length, - prepostprocess_dropout, - bos_idx=bos_idx, - word_emb_param_name=word_emb_param_names[0] - if weight_sharing else word_emb_param_names[1]) - dec_output = decoder( - dec_input, - enc_output, - trg_slf_attn_bias, - trg_src_attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - caches=caches, - gather_idx=gather_idx) - # Reshape to 2D tensor to use GEMM instead of BatchedGEMM - dec_output = layers.reshape( - dec_output, shape=[-1, dec_output.shape[-1]], inplace=True) - if weight_sharing: - predict = layers.matmul( - x=dec_output, - y=fluid.default_main_program().global_block().var( - word_emb_param_names[0]), - transpose_y=True) - else: - predict = layers.fc(input=dec_output, - size=trg_vocab_size, - bias_attr=False) - if dec_inputs is None: - # Return probs for independent decoder program. - predict = layers.softmax(predict) - return predict - - -def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len, - n_layer, n_head, d_key, d_value, d_model, d_inner_hid, - prepostprocess_dropout, attention_dropout, relu_dropout, - preprocess_cmd, postprocess_cmd, weight_sharing, beam_size, - max_out_len, bos_idx, eos_idx): - """ - Use beam search to decode. Caches will be used to store states of history - steps which can make the decoding faster. 
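A toy numpy sketch of the per-layer decoder self-attention cache this docstring refers to: each step's new key/value slice is concatenated along the time axis, so step `t` attends over all earlier positions without recomputation (shapes are made up, mirroring the `[-1, n_head, 0, d_key]` layout used below):

```python
import numpy as np

batch, n_head, d_key = 3, 2, 4
cache_k = np.zeros((batch, n_head, 0, d_key), dtype="float32")  # empty time axis
for step in range(3):
    new_k = np.random.rand(batch, n_head, 1, d_key).astype("float32")
    cache_k = np.concatenate([cache_k, new_k], axis=2)  # one slot per step
print(cache_k.shape)  # (3, 2, 3, 4) after three decoded steps
```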
- """ - enc_inputs = (model_input.src_word, model_input.src_pos, - model_input.src_slf_attn_bias) - dec_inputs = (model_input.trg_word, model_input.init_score, - model_input.init_idx, model_input.trg_src_attn_bias) - - enc_output = wrap_encoder(enc_inputs, - src_vocab_size, - max_in_len, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - bos_idx=bos_idx) - start_tokens, init_scores, parent_idx, trg_src_attn_bias = dec_inputs - - def beam_search(): - max_len = layers.fill_constant( - shape=[1], - dtype=start_tokens.dtype, - value=max_out_len, - force_cpu=True) - step_idx = layers.fill_constant( - shape=[1], dtype=start_tokens.dtype, value=0, force_cpu=True) - cond = layers.less_than(x=step_idx, y=max_len) # default force_cpu=True - while_op = layers.While(cond) - # array states will be stored for each step. - ids = layers.array_write( - layers.reshape(start_tokens, (-1, 1)), step_idx) - scores = layers.array_write(init_scores, step_idx) - # cell states will be overwrited at each step. - # caches contains states of history steps in decoder self-attention - # and static encoder output projections in encoder-decoder attention - # to reduce redundant computation. - caches = [ - { - "k": # for self attention - layers.fill_constant_batch_size_like( - input=start_tokens, - shape=[-1, n_head, 0, d_key], - dtype=enc_output.dtype, - value=0), - "v": # for self attention - layers.fill_constant_batch_size_like( - input=start_tokens, - shape=[-1, n_head, 0, d_value], - dtype=enc_output.dtype, - value=0), - "static_k": # for encoder-decoder attention - layers.create_tensor(dtype=enc_output.dtype), - "static_v": # for encoder-decoder attention - layers.create_tensor(dtype=enc_output.dtype) - } for i in range(n_layer) - ] - - with while_op.block(): - pre_ids = layers.array_read(array=ids, i=step_idx) - # Since beam_search_op dosen't enforce pre_ids' shape, we can do - # inplace reshape here which actually change the shape of pre_ids. - # pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True) - pre_scores = layers.array_read(array=scores, i=step_idx) - # gather cell states corresponding to selected parent - pre_src_attn_bias = layers.gather( - trg_src_attn_bias, index=parent_idx) - pre_pos = layers.elementwise_mul( - x=layers.fill_constant_batch_size_like( - input=pre_src_attn_bias, # cann't use lod tensor here - value=1, - shape=[-1, 1], - dtype=pre_ids.dtype), - y=step_idx, - axis=0) - logits = wrap_decoder((pre_ids, pre_pos, None, pre_src_attn_bias), - trg_vocab_size, - max_in_len, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - preprocess_cmd, - postprocess_cmd, - weight_sharing, - enc_output=enc_output, - caches=caches, - gather_idx=parent_idx, - bos_idx=bos_idx) - # intra-beam topK - topk_scores, topk_indices = layers.topk( - input=layers.softmax(logits), k=beam_size) - accu_scores = layers.elementwise_add( - x=layers.log(topk_scores), y=pre_scores, axis=0) - # beam_search op uses lod to differentiate branches. 
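The two statements above implement the per-step scoring: log-probabilities of the intra-beam top-k candidates are added to each branch's accumulated score before the cross-beam reduction. A numpy analogue with made-up numbers:

```python
import numpy as np

pre_scores = np.array([-0.1, -0.9])    # accumulated log-prob, one per branch
topk_scores = np.array([[0.6, 0.3],    # intra-beam softmax top-k, beam_size=2
                        [0.5, 0.4]])
# elementwise_add(x=log(topk_scores), y=pre_scores, axis=0) broadcasts each
# branch's running score over its candidates:
accu = np.log(topk_scores) + pre_scores[:, None]
print(accu.round(3))
```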
- accu_scores = layers.lod_reset(accu_scores, pre_ids) - # topK reduction across beams, also contain special handle of - # end beams and end sentences(batch reduction) - selected_ids, selected_scores, gather_idx = layers.beam_search( - pre_ids=pre_ids, - pre_scores=pre_scores, - ids=topk_indices, - scores=accu_scores, - beam_size=beam_size, - end_id=eos_idx, - return_parent_idx=True) - layers.increment(x=step_idx, value=1.0, in_place=True) - # cell states(caches) have been updated in wrap_decoder, - # only need to update beam search states here. - layers.array_write(selected_ids, i=step_idx, array=ids) - layers.array_write(selected_scores, i=step_idx, array=scores) - layers.assign(gather_idx, parent_idx) - layers.assign(pre_src_attn_bias, trg_src_attn_bias) - length_cond = layers.less_than(x=step_idx, y=max_len) - finish_cond = layers.logical_not(layers.is_empty(x=selected_ids)) - layers.logical_and(x=length_cond, y=finish_cond, out=cond) - - finished_ids, finished_scores = layers.beam_search_decode( - ids, scores, beam_size=beam_size, end_id=eos_idx) - return finished_ids, finished_scores - - finished_ids, finished_scores = beam_search() - return finished_ids, finished_scores - - -def create_net(is_training, model_input, args): - if is_training: - sum_cost, avg_cost, _, token_num = transformer( - model_input, args.src_vocab_size, args.trg_vocab_size, - args.max_length + 1, args.n_layer, args.n_head, args.d_key, - args.d_value, args.d_model, args.d_inner_hid, - args.prepostprocess_dropout, args.attention_dropout, - args.relu_dropout, args.preprocess_cmd, args.postprocess_cmd, - args.weight_sharing, args.label_smooth_eps, args.bos_idx) - return sum_cost, avg_cost, token_num - else: - out_ids, out_scores = fast_decode( - model_input, args.src_vocab_size, args.trg_vocab_size, - args.max_length + 1, args.n_layer, args.n_head, args.d_key, - args.d_value, args.d_model, args.d_inner_hid, - args.prepostprocess_dropout, args.attention_dropout, - args.relu_dropout, args.preprocess_cmd, args.postprocess_cmd, - args.weight_sharing, args.beam_size, args.max_out_len, args.bos_idx, - args.eos_idx) - return out_ids, out_scores diff --git a/PaddleNLP/neural_machine_translation/transformer/transformer.yaml b/PaddleNLP/neural_machine_translation/transformer/transformer.yaml deleted file mode 100644 index c6cbc074ed8a76c8b4d649e7631f0c125e165511..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/transformer.yaml +++ /dev/null @@ -1,111 +0,0 @@ -# used for continuous evaluation -enable_ce: False - -# The frequency to save trained models when training. -save_step: 10000 -# The frequency to fetch and print output when training. -print_step: 100 -# path of the checkpoint, to resume the previous training -init_from_checkpoint: "" -# path of the pretrain model, to better solve the current task -init_from_pretrain_model: "" -# path of trained parameter, to make prediction -init_from_params: "trained_params/step_100000" -save_model_path: "" -# the directory for saving checkpoints. -save_checkpoint: "trained_ckpts" -# the directory for saving trained parameters. -save_param: "trained_params" -# the directory for saving inference model. -inference_model_dir: "infer_model" -# Set seed for CE or debug -random_seed: None -# The pattern to match training data files. -training_file: "wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de" -# The pattern to match test data files. 
-predict_file: "wmt16_ende_data_bpe/newstest2016.tok.bpe.32000.en-de"
-# The file to output the translation results of predict_file to.
-output_file: "predict.txt"
-# The path of vocabulary file of source language.
-src_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000"
-# The path of vocabulary file of target language.
-trg_vocab_fpath: "wmt16_ende_data_bpe/vocab_all.bpe.32000"
-# The <s>, <e> and <unk> tokens in the dictionary.
-special_token: ["<s>", "<e>", "<unk>"]
-
-# whether to use cuda
-use_cuda: True
-
-# args for reader, see reader.py for details
-token_delimiter: " "
-use_token_batch: True
-pool_size: 200000
-sort_type: "pool"
-shuffle: True
-shuffle_batch: True
-batch_size: 4096
-
-# Hyperparams for training:
-# the number of epochs for training
-epoch: 30
-# the hyperparameters for the Adam optimizer.
-# This static learning_rate will be multiplied by the LearningRateScheduler-
-# derived learning rate to get the final learning rate.
-learning_rate: 2.0
-beta1: 0.9
-beta2: 0.997
-eps: 1e-9
-# the parameters for learning rate scheduling.
-warmup_steps: 8000
-# the weight used to mix up the ground-truth distribution and the fixed
-# uniform distribution in label smoothing when training.
-# Set this as zero if label smoothing is not wanted.
-label_smooth_eps: 0.1
-
-# Hyperparams for generation:
-# the parameters for beam search.
-beam_size: 5
-max_out_len: 256
-# the number of decoded sentences to output.
-n_best: 1
-
-# Hyperparams for model:
-# The following five vocabulary-related configurations will be set
-# automatically according to the passed vocabulary path and special tokens.
-# size of source word dictionary.
-src_vocab_size: 10000
-# size of target word dictionary.
-trg_vocab_size: 10000
-# index for <s> token
-bos_idx: 0
-# index for <e> token
-eos_idx: 1
-# index for <unk> token
-unk_idx: 2
-# the max length of sequences, which decides the size of the position encoding table.
-max_length: 256
-# the dimension for word embeddings, which is also the last dimension of
-# the input and output of multi-head attention, position-wise feed-forward
-# networks, encoder and decoder.
-d_model: 512
-# size of the hidden layer in position-wise feed-forward networks.
-d_inner_hid: 2048
-# the dimension that keys are projected to for dot-product attention.
-d_key: 64
-# the dimension that values are projected to for dot-product attention.
-d_value: 64
-# number of heads used in multi-head attention.
-n_head: 8
-# number of sub-layers to be stacked in the encoder and decoder.
-n_layer: 6
-# dropout rates of different modules.
-prepostprocess_dropout: 0.1
-attention_dropout: 0.1
-relu_dropout: 0.1
-# to process before each sub-layer
-preprocess_cmd: "n" # layer normalization
-# to process after each sub-layer
-postprocess_cmd: "da" # dropout + residual connection
-# the flag indicating whether to share embedding and softmax weights.
-# vocabularies in source and target should be the same for weight sharing.
-weight_sharing: True diff --git a/PaddleNLP/neural_machine_translation/transformer/utils/__init__.py b/PaddleNLP/neural_machine_translation/transformer/utils/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/PaddleNLP/neural_machine_translation/transformer/utils/check.py b/PaddleNLP/neural_machine_translation/transformer/utils/check.py deleted file mode 100644 index 305fa3705f5c313569986cbdb15c8afeda5a79c1..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/utils/check.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import sys - -import paddle.fluid as fluid - -import logging -logger = logging.getLogger(__name__) - -__all__ = ['check_gpu', 'check_version'] - - -def check_gpu(use_gpu): - """ - Log error and exit when set use_gpu=true in paddlepaddle - cpu version. - """ - err = "Config use_gpu cannot be set as true while you are " \ - "using paddlepaddle cpu version ! \nPlease try: \n" \ - "\t1. Install paddlepaddle-gpu to run model on GPU \n" \ - "\t2. Set use_gpu as false in config file to run " \ - "model on CPU" - - try: - if use_gpu and not fluid.is_compiled_with_cuda(): - logger.error(err) - sys.exit(1) - except Exception as e: - pass - - -def check_version(): - """ - Log error and exit when the installed version of paddlepaddle is - not satisfied. - """ - err = "PaddlePaddle version 1.6 or higher is required, " \ - "or a suitable develop version is satisfied as well. \n" \ - "Please make sure the version is good with your code." \ - - try: - fluid.require_version('1.6.0') - except Exception as e: - logger.error(err) - sys.exit(1) diff --git a/PaddleNLP/neural_machine_translation/transformer/utils/configure.py b/PaddleNLP/neural_machine_translation/transformer/utils/configure.py deleted file mode 100644 index 2ea9fd96817f461889d24cbbd0c5d9ae76585a0a..0000000000000000000000000000000000000000 --- a/PaddleNLP/neural_machine_translation/transformer/utils/configure.py +++ /dev/null @@ -1,345 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
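A hypothetical usage sketch of the `PDConfig` helper defined below, run from this directory with the `transformer.yaml` shown above (the extra `my_tag` flag is purely illustrative):

```python
from utils.configure import PDConfig

args = PDConfig(yaml_file="./transformer.yaml")
args += ("my_tag", str, "exp1", "A custom run tag.")  # __add__ registers an arg
args.build()   # parse the CLI; command-line values override the yaml
args.Print()
print(args.batch_size, args.my_tag)
```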
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import argparse -import json -import yaml -import six -import logging - -logging_only_message = "%(message)s" -logging_details = "%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s" - - -class JsonConfig(object): - """ - A high-level api for handling json configure file. - """ - - def __init__(self, config_path): - self._config_dict = self._parse(config_path) - - def _parse(self, config_path): - try: - with open(config_path) as json_file: - config_dict = json.load(json_file) - except: - raise IOError("Error in parsing bert model config file '%s'" % - config_path) - else: - return config_dict - - def __getitem__(self, key): - return self._config_dict[key] - - def print_config(self): - for arg, value in sorted(six.iteritems(self._config_dict)): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - - -class ArgumentGroup(object): - def __init__(self, parser, title, des): - self._group = parser.add_argument_group(title=title, description=des) - - def add_arg(self, name, type, default, help, **kwargs): - type = str2bool if type == bool else type - self._group.add_argument( - "--" + name, - default=default, - type=type, - help=help + ' Default: %(default)s.', - **kwargs) - - -class ArgConfig(object): - """ - A high-level api for handling argument configs. - """ - - def __init__(self): - parser = argparse.ArgumentParser() - - train_g = ArgumentGroup(parser, "training", "training options.") - train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.") - train_g.add_arg("learning_rate", float, 5e-5, - "Learning rate used to train with warmup.") - train_g.add_arg( - "lr_scheduler", - str, - "linear_warmup_decay", - "scheduler of learning rate.", - choices=['linear_warmup_decay', 'noam_decay']) - train_g.add_arg("weight_decay", float, 0.01, - "Weight decay rate for L2 regularizer.") - train_g.add_arg( - "warmup_proportion", float, 0.1, - "Proportion of training steps to perform linear learning rate warmup for." - ) - train_g.add_arg("save_steps", int, 1000, - "The steps interval to save checkpoints.") - train_g.add_arg("use_fp16", bool, False, - "Whether to use fp16 mixed precision training.") - train_g.add_arg( - "loss_scaling", float, 1.0, - "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled." 
- ) - train_g.add_arg("pred_dir", str, None, - "Path to save the prediction results") - - log_g = ArgumentGroup(parser, "logging", "logging related.") - log_g.add_arg("skip_steps", int, 10, - "The steps interval to print loss.") - log_g.add_arg("verbose", bool, False, "Whether to output verbose log.") - - run_type_g = ArgumentGroup(parser, "run_type", "running type options.") - run_type_g.add_arg("use_cuda", bool, True, - "If set, use GPU for training.") - run_type_g.add_arg( - "use_fast_executor", bool, False, - "If set, use fast parallel executor (in experiment).") - run_type_g.add_arg( - "num_iteration_per_drop_scope", int, 1, - "Ihe iteration intervals to clean up temporary variables.") - run_type_g.add_arg("do_train", bool, True, - "Whether to perform training.") - run_type_g.add_arg("do_predict", bool, True, - "Whether to perform prediction.") - - custom_g = ArgumentGroup(parser, "customize", "customized options.") - - self.custom_g = custom_g - - self.parser = parser - - def add_arg(self, name, dtype, default, descrip): - self.custom_g.add_arg(name, dtype, default, descrip) - - def build_conf(self): - return self.parser.parse_args() - - -def str2bool(v): - # because argparse does not support to parse "true, False" as python - # boolean directly - return v.lower() in ("true", "t", "1") - - -def print_arguments(args, log=None): - if not log: - print('----------- Configuration Arguments -----------') - for arg, value in sorted(six.iteritems(vars(args))): - print('%s: %s' % (arg, value)) - print('------------------------------------------------') - else: - log.info('----------- Configuration Arguments -----------') - for arg, value in sorted(six.iteritems(vars(args))): - log.info('%s: %s' % (arg, value)) - log.info('------------------------------------------------') - - -class PDConfig(object): - """ - A high-level API for managing configuration files in PaddlePaddle. - Can jointly work with command-line-arugment, json files and yaml files. - """ - - def __init__(self, json_file="", yaml_file="", fuse_args=True): - """ - Init funciton for PDConfig. - json_file: the path to the json configure file. - yaml_file: the path to the yaml configure file. - fuse_args: if fuse the json/yaml configs with argparse. - """ - assert isinstance(json_file, str) - assert isinstance(yaml_file, str) - - if json_file != "" and yaml_file != "": - raise Warning( - "json_file and yaml_file can not co-exist for now. please only use one configure file type." 
- ) - return - - self.args = None - self.arg_config = {} - self.json_config = {} - self.yaml_config = {} - - parser = argparse.ArgumentParser() - - self.default_g = ArgumentGroup(parser, "default", "default options.") - self.yaml_g = ArgumentGroup(parser, "yaml", "options from yaml.") - self.json_g = ArgumentGroup(parser, "json", "options from json.") - self.com_g = ArgumentGroup(parser, "custom", "customized options.") - - self.default_g.add_arg("do_train", bool, False, - "Whether to perform training.") - self.default_g.add_arg("do_predict", bool, False, - "Whether to perform predicting.") - self.default_g.add_arg("do_eval", bool, False, - "Whether to perform evaluating.") - self.default_g.add_arg("do_save_inference_model", bool, False, - "Whether to perform model saving for inference.") - - self.parser = parser - - if json_file != "": - self.load_json(json_file, fuse_args=fuse_args) - - if yaml_file: - self.load_yaml(yaml_file, fuse_args=fuse_args) - - def load_json(self, file_path, fuse_args=True): - - if not os.path.exists(file_path): - raise Warning("the json file %s does not exist." % file_path) - return - - with open(file_path, "r") as fin: - self.json_config = json.loads(fin.read()) - fin.close() - - if fuse_args: - for name in self.json_config: - if isinstance(self.json_config[name], list): - self.json_g.add_arg( - name, - type(self.json_config[name][0]), - self.json_config[name], - "This is from %s" % file_path, - nargs=len(self.json_config[name])) - continue - if not isinstance(self.json_config[name], int) \ - and not isinstance(self.json_config[name], float) \ - and not isinstance(self.json_config[name], str) \ - and not isinstance(self.json_config[name], bool): - - continue - - self.json_g.add_arg(name, - type(self.json_config[name]), - self.json_config[name], - "This is from %s" % file_path) - - def load_yaml(self, file_path, fuse_args=True): - - if not os.path.exists(file_path): - raise Warning("the yaml file %s does not exist." % file_path) - return - - with open(file_path, "r") as fin: - self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader) - fin.close() - - if fuse_args: - for name in self.yaml_config: - if isinstance(self.yaml_config[name], list): - self.yaml_g.add_arg( - name, - type(self.yaml_config[name][0]), - self.yaml_config[name], - "This is from %s" % file_path, - nargs=len(self.yaml_config[name])) - continue - - if not isinstance(self.yaml_config[name], int) \ - and not isinstance(self.yaml_config[name], float) \ - and not isinstance(self.yaml_config[name], str) \ - and not isinstance(self.yaml_config[name], bool): - - continue - - self.yaml_g.add_arg(name, - type(self.yaml_config[name]), - self.yaml_config[name], - "This is from %s" % file_path) - - def build(self): - self.args = self.parser.parse_args() - self.arg_config = vars(self.args) - - def __add__(self, new_arg): - assert isinstance(new_arg, list) or isinstance(new_arg, tuple) - assert len(new_arg) >= 3 - assert self.args is None - - name = new_arg[0] - dtype = new_arg[1] - dvalue = new_arg[2] - desc = new_arg[3] if len( - new_arg) == 4 else "Description is not provided." - - self.com_g.add_arg(name, dtype, dvalue, desc) - - return self - - def __getattr__(self, name): - if name in self.arg_config: - return self.arg_config[name] - - if name in self.json_config: - return self.json_config[name] - - if name in self.yaml_config: - return self.yaml_config[name] - - raise Warning("The argument %s is not defined." 
-
-    def Print(self):
-
-        print("-" * 70)
-        for name in self.arg_config:
-            print("%s:\t\t\t\t%s" % (str(name), str(self.arg_config[name])))
-
-        for name in self.json_config:
-            if name not in self.arg_config:
-                print("%s:\t\t\t\t%s" %
-                      (str(name), str(self.json_config[name])))
-
-        for name in self.yaml_config:
-            if name not in self.arg_config:
-                print("%s:\t\t\t\t%s" %
-                      (str(name), str(self.yaml_config[name])))
-
-        print("-" * 70)
-
-
-if __name__ == "__main__":
-    """
-    pd_config = PDConfig(json_file = "./test/bert_config.json")
-    pd_config.build()
-
-    print(pd_config.do_train)
-    print(pd_config.hidden_size)
-
-    pd_config = PDConfig(yaml_file = "./test/bert_config.yaml")
-    pd_config.build()
-
-    print(pd_config.do_train)
-    print(pd_config.hidden_size)
-    """
-
-    pd_config = PDConfig(yaml_file="./test/bert_config.yaml")
-    pd_config += ("my_age", int, 18, "I am forever 18.")
-    pd_config.build()
-
-    print(pd_config.do_train)
-    print(pd_config.hidden_size)
-    print(pd_config.my_age)
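For orientation, here is a minimal usage sketch of the `PDConfig` class deleted above, assuming the module now lives at `utils/configure.py`; the `./transformer.yaml` path and the `d_model` key are hypothetical stand-ins, not values taken from this patch:

```python
# Minimal PDConfig sketch; "./transformer.yaml" and "d_model" are assumptions.
from utils.configure import PDConfig

args = PDConfig(yaml_file="./transformer.yaml")
# Register an extra command-line flag before build(); tuples are
# (name, type, default, description).
args += ("use_amp", bool, False, "Whether to use mixed precision.")
args.build()  # parse argv and fuse it with the yaml entries
args.Print()  # dump the resolved configuration

if args.do_train:  # do_train is one of PDConfig's built-in flags
    print(args.d_model)  # falls back to the raw yaml config if not fused
```

Because `__add__` asserts `self.args is None`, all custom options must be registered before `build()` is called.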
diff --git a/PaddleNLP/neural_machine_translation/transformer/utils/dist_utils.py b/PaddleNLP/neural_machine_translation/transformer/utils/dist_utils.py
deleted file mode 100644
index 503431029f0242d27473ae5d4d95834f99ef0f84..0000000000000000000000000000000000000000
--- a/PaddleNLP/neural_machine_translation/transformer/utils/dist_utils.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import os
-import paddle.fluid as fluid
-
-
-def nccl2_prepare(trainer_id, startup_prog, main_prog):
-    config = fluid.DistributeTranspilerConfig()
-    config.mode = "nccl2"
-    t = fluid.DistributeTranspiler(config=config)
-    t.transpile(
-        trainer_id,
-        trainers=os.environ.get('PADDLE_TRAINER_ENDPOINTS'),
-        current_endpoint=os.environ.get('PADDLE_CURRENT_ENDPOINT'),
-        startup_program=startup_prog,
-        program=main_prog)
-
-
-def prepare_for_multi_process(exe, build_strategy, train_prog):
-    # prepare for multi-process
-    trainer_id = int(os.environ.get('PADDLE_TRAINER_ID', 0))
-    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
-    if num_trainers < 2: return
-    print("PADDLE_TRAINERS_NUM", num_trainers)
-    print("PADDLE_TRAINER_ID", trainer_id)
-    build_strategy.num_trainers = num_trainers
-    build_strategy.trainer_id = trainer_id
-    # NOTE(zcd): use multiple processes to train the model,
-    # each process using one GPU card.
-    startup_prog = fluid.Program()
-    nccl2_prepare(trainer_id, startup_prog, train_prog)
-    # the startup_prog is run twice, but it doesn't matter.
-    exe.run(startup_prog)
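A hedged sketch of how `prepare_for_multi_process` is typically wired into a trainer, assuming the helper survives at `utils/dist_utils.py`; the executor and program setup here are illustrative, not the repo's actual training loop:

```python
# Sketch only: the import path and the program construction are assumptions.
import paddle.fluid as fluid
from utils.dist_utils import prepare_for_multi_process

train_prog = fluid.Program()
startup_prog = fluid.Program()
# ... define the model and optimizer inside train_prog ...

exe = fluid.Executor(fluid.CUDAPlace(0))
build_strategy = fluid.BuildStrategy()

# No-op when PADDLE_TRAINERS_NUM < 2; otherwise it sets trainer count/id on
# build_strategy and runs the NCCL2 transpiler using the PADDLE_* env vars.
prepare_for_multi_process(exe, build_strategy, train_prog)
exe.run(startup_prog)
```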
diff --git a/PaddleNLP/neural_machine_translation/transformer/utils/input_field.py b/PaddleNLP/neural_machine_translation/transformer/utils/input_field.py
deleted file mode 100644
index de56712399df446baf73707494cb6ec8e7566b25..0000000000000000000000000000000000000000
--- a/PaddleNLP/neural_machine_translation/transformer/utils/input_field.py
+++ /dev/null
@@ -1,182 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-from __future__ import division
-from __future__ import print_function
-
-import os
-import six
-import ast
-import copy
-
-import numpy as np
-import paddle.fluid as fluid
-
-
-class Placeholder(object):
-    def __init__(self):
-        self.shapes = []
-        self.dtypes = []
-        self.lod_levels = []
-        self.names = []
-
-    def __init__(self, input_shapes):
-
-        self.shapes = []
-        self.dtypes = []
-        self.lod_levels = []
-        self.names = []
-
-        for new_holder in input_shapes:
-            shape = new_holder[0]
-            dtype = new_holder[1]
-            lod_level = new_holder[2] if len(new_holder) >= 3 else 0
-            name = new_holder[3] if len(new_holder) >= 4 else ""
-
-            self.append_placeholder(
-                shape, dtype, lod_level=lod_level, name=name)
-
-    def append_placeholder(self, shape, dtype, lod_level=0, name=""):
-        self.shapes.append(shape)
-        self.dtypes.append(dtype)
-        self.lod_levels.append(lod_level)
-        self.names.append(name)
-
-    def build(self, capacity, reader_name, use_double_buffer=False):
-        pyreader = fluid.layers.py_reader(
-            capacity=capacity,
-            shapes=self.shapes,
-            dtypes=self.dtypes,
-            lod_levels=self.lod_levels,
-            name=reader_name,
-            use_double_buffer=use_double_buffer)
-
-        return [pyreader, fluid.layers.read_file(pyreader)]
-
-    def __add__(self, new_holder):
-        assert isinstance(new_holder, tuple) or isinstance(new_holder, list)
-        assert len(new_holder) >= 2
-
-        shape = new_holder[0]
-        dtype = new_holder[1]
-        lod_level = new_holder[2] if len(new_holder) >= 3 else 0
-        name = new_holder[3] if len(new_holder) >= 4 else ""
-
-        self.append_placeholder(shape, dtype, lod_level=lod_level, name=name)
-
-
-class InputField(object):
-    """
-    A high-level API for handling inputs in PaddlePaddle.
-    """
-
-    def __init__(self, input_slots=[]):
-
-        self.shapes = []
-        self.dtypes = []
-        self.names = []
-        self.lod_levels = []
-
-        self.input_slots = {}
-        self.feed_list_str = []
-        self.feed_list = []
-
-        self.loader = None
-
-        if input_slots:
-            for input_slot in input_slots:
-                self += input_slot
-
-    def __add__(self, input_slot):
-
-        if isinstance(input_slot, list) or isinstance(input_slot, tuple):
-            name = input_slot[0]
-            shape = input_slot[1]
-            dtype = input_slot[2]
-            lod_level = input_slot[3] if len(input_slot) == 4 else 0
-
-        if isinstance(input_slot, dict):
-            name = input_slot["name"]
-            shape = input_slot["shape"]
-            dtype = input_slot["dtype"]
-            lod_level = input_slot[
-                "lod_level"] if "lod_level" in input_slot else 0
-
-        self.shapes.append(shape)
-        self.dtypes.append(dtype)
-        self.names.append(name)
-        self.lod_levels.append(lod_level)
-
-        self.feed_list_str.append(name)
-
-        return self
-
-    def __getattr__(self, name):
-
-        if name not in self.input_slots:
-            raise Warning("the attr %s has not been defined yet." % name)
-            return None
-
-        return self.input_slots[name]
-
-    def build(self, build_pyreader=False, capacity=100, iterable=False):
-
-        for _name, _shape, _dtype, _lod_level in zip(
-                self.names, self.shapes, self.dtypes, self.lod_levels):
-            self.input_slots[_name] = fluid.data(
-                name=_name, shape=_shape, dtype=_dtype, lod_level=_lod_level)
-
-        for name in self.feed_list_str:
-            self.feed_list.append(self.input_slots[name])
-
-        self.loader = fluid.io.DataLoader.from_generator(
-            feed_list=self.feed_list,
-            capacity=capacity,
-            iterable=(not build_pyreader),
-            use_double_buffer=True)
-
-
-if __name__ == "__main__":
-
-    mnist_input_slots = [{
-        "name": "image",
-        "shape": (-1, 32, 32, 1),
-        "dtype": "int32"
-    }, {
-        "name": "label",
-        "shape": [-1, 1],
-        "dtype": "int64"
-    }]
-
-    input_field = InputField(mnist_input_slots)
-
-    input_field += {
-        "name": "large_image",
-        "shape": (-1, 64, 64, 1),
-        "dtype": "int32"
-    }
-    input_field += {
-        "name": "large_color_image",
-        "shape": (-1, 64, 64, 3),
-        "dtype": "int32"
-    }
-
-    input_field.build()
-
-    print(input_field.feed_list)
-
-    print(input_field.image)
-
-    print(input_field.large_color_image)
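Beyond the self-test in the deleted file, a short sketch of the `InputField` workflow as it feeds a DataLoader, assuming the module's new location at `utils/input_field.py`; the field names, shapes, and generator are invented for illustration:

```python
# Illustrative InputField usage; field definitions are made up for the sketch.
import numpy as np
import paddle.fluid as fluid
from utils.input_field import InputField

# Tuple form is (name, shape, dtype) with an optional trailing lod_level.
input_field = InputField([("src_word", [-1, 256], "int64"),
                          ("trg_word", [-1, 256], "int64")])
input_field.build()  # creates fluid.data variables and an iterable DataLoader

def batch_generator():
    # One dummy batch matching the declared shapes and dtypes.
    yield [np.zeros((4, 256), "int64"), np.zeros((4, 256), "int64")]

input_field.loader.set_batch_generator(batch_generator,
                                       places=fluid.cpu_places())
# Individual variables are reachable as attributes, e.g. input_field.src_word.
```

Because `build(build_pyreader=False)` creates the loader with `iterable=True`, batches are consumed by iterating `input_field.loader()` and feeding the results to an executor.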