Merge pull request #48 from xyzhou-puck/master

refine text.py

Merge pull request #48 from xyzhou-puck/master
refine text.py
e0f5c55d · pkpk · GitHub · ed14907e · 1fac53aa · e0f5c55d
10 changed files
--- a/examples/bert/bert_classifier.py
+++ b/examples/bert/bert_classifier.py
@@ -16,14 +16,60 @@
 import paddle.fluid as fluid
 from hapi.metrics import Accuracy
 from hapi.configure import Config
+from hapi.text.bert import BertEncoder
+from paddle.fluid.dygraph import Linear, Layer
 from hapi.model import set_device, Model, SoftmaxWithCrossEntropy, Input
-from cls import ClsModelLayer
 import hapi.text.tokenizer.tokenization as tokenization
 from hapi.text.bert import Optimizer, BertConfig, BertDataLoader, BertInputExample
-def train():
+class ClsModelLayer(Model):
+    """
+    classify model
+    """
+    def __init__(self,
+                 args,
+                 config,
+                 num_labels,
+                 return_pooled_out=True,
+                 use_fp16=False):
+        super(ClsModelLayer, self).__init__()
+        self.config = config
+        self.use_fp16 = use_fp16
+        self.loss_scaling = args.loss_scaling
+        self.bert_layer = BertEncoder(
+            config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
+        self.cls_fc = Linear(
+            input_dim=self.config["hidden_size"],
+            output_dim=num_labels,
+            param_attr=fluid.ParamAttr(
+                name="cls_out_w",
+                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+            bias_attr=fluid.ParamAttr(
+                name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
+    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
+        """
+        forward
+        """
+        enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
+                                                     sentence_ids, input_mask)
+        cls_feats = fluid.layers.dropout(
+            x=next_sent_feat,
+            dropout_prob=0.1,
+            dropout_implementation="upscale_in_train")
+        pred = self.cls_fc(cls_feats)
+        return pred
+def main():
    config = Config(yaml_file="./bert.yaml")
    config.build()
@@ -35,8 +81,6 @@ def train():
    bert_config = BertConfig(config.bert_config_path)
    bert_config.print_config()
-    trainer_count = fluid.dygraph.parallel.Env().nranks
    tokenizer = tokenization.FullTokenizer(
        vocab_file=config.vocab_path, do_lower_case=config.do_lower_case)
@@ -52,14 +96,24 @@ def train():
        return BertInputExample(
            uid=uid, text_a=text_a, text_b=text_b, label=label)
-    bert_dataloader = BertDataLoader(
+    train_dataloader = BertDataLoader(
        "./data/glue_data/MNLI/train.tsv",
        tokenizer, ["contradiction", "entailment", "neutral"],
-        max_seq_length=64,
+        max_seq_length=config.max_seq_len,
-        batch_size=32,
+        batch_size=config.batch_size,
        line_processor=mnli_line_processor)
-    num_train_examples = len(bert_dataloader.dataset)
+    dev_dataloader = BertDataLoader(
+        "./data/glue_data/MNLI/dev_matched.tsv",
+        tokenizer, ["contradiction", "entailment", "neutral"],
+        max_seq_length=config.max_seq_len,
+        batch_size=config.batch_size,
+        line_processor=mnli_line_processor,
+        shuffle=False,
+        phase="predict")
+    trainer_count = fluid.dygraph.parallel.Env().nranks
+    num_train_examples = len(train_dataloader.dataset)
    max_train_steps = config.epoch * num_train_examples // config.batch_size // trainer_count
    warmup_steps = int(max_train_steps * config.warmup_proportion)
@@ -82,7 +136,6 @@ def train():
        config,
        bert_config,
        len(["contradiction", "entailment", "neutral"]),
-        is_training=True,
        return_pooled_out=True)
    optimizer = Optimizer(
@@ -106,10 +159,15 @@ def train():
    cls_model.bert_layer.init_parameters(
        config.init_pretraining_params, verbose=config.verbose)
-    cls_model.fit(train_data=bert_dataloader.dataloader, epochs=config.epoch)
+    # do train
+    cls_model.fit(train_data=train_dataloader.dataloader,
+                  epochs=config.epoch,
+                  save_dir=config.checkpoints)
-    return cls_model
+    # do eval
+    cls_model.evaluate(
+        eval_data=dev_dataloader.dataloader, batch_size=config.batch_size)
 if __name__ == '__main__':
-    cls_model = train()
+    main()
--- a/examples/bert/cls.py
+++ b/examples/bert/cls.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"dygraph transformer layers"
-import six
-import json
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid.dygraph import Linear, Layer
-from hapi.text.bert import BertEncoder
-from hapi.model import Model
-class ClsModelLayer(Model):
-    """
-    classify model
-    """
-    def __init__(self,
-                 args,
-                 config,
-                 num_labels,
-                 is_training=True,
-                 return_pooled_out=True,
-                 use_fp16=False):
-        super(ClsModelLayer, self).__init__()
-        self.config = config
-        self.is_training = is_training
-        self.use_fp16 = use_fp16
-        self.loss_scaling = args.loss_scaling
-        self.bert_layer = BertEncoder(
-            config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
-        self.cls_fc = Linear(
-            input_dim=self.config["hidden_size"],
-            output_dim=num_labels,
-            param_attr=fluid.ParamAttr(
-                name="cls_out_w",
-                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
-            bias_attr=fluid.ParamAttr(
-                name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
-    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
-        """
-        forward
-        """
-        enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
-                                                     sentence_ids, input_mask)
-        cls_feats = fluid.layers.dropout(
-            x=next_sent_feat,
-            dropout_prob=0.1,
-            dropout_implementation="upscale_in_train")
-        logits = self.cls_fc(cls_feats)
-        return logits
--- a/examples/bert_leveldb/bert.yaml
+++ b/examples/bert_leveldb/bert.yaml
@@ -18,7 +18,7 @@ batch_size: 32
 in_tokens: False
 do_lower_case: True
 random_seed: 5512
-use_cuda: False
+use_cuda: True
 shuffle: True
 do_train: True
 do_test: True

--- a/examples/bert_leveldb/bert_classifier.py
+++ b/examples/bert_leveldb/bert_classifier.py
@@ -16,14 +16,60 @@
 import paddle.fluid as fluid
 from hapi.metrics import Accuracy
 from hapi.configure import Config
+from hapi.text.bert import BertEncoder
+from paddle.fluid.dygraph import Linear, Layer
 from hapi.model import set_device, Model, SoftmaxWithCrossEntropy, Input
-from cls import ClsModelLayer
 import hapi.text.tokenizer.tokenization as tokenization
 from hapi.text.bert import Optimizer, BertConfig, BertDataLoader, BertInputExample
-def train():
+class ClsModelLayer(Model):
+    """
+    classify model
+    """
+    def __init__(self,
+                 args,
+                 config,
+                 num_labels,
+                 return_pooled_out=True,
+                 use_fp16=False):
+        super(ClsModelLayer, self).__init__()
+        self.config = config
+        self.use_fp16 = use_fp16
+        self.loss_scaling = args.loss_scaling
+        self.bert_layer = BertEncoder(
+            config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
+        self.cls_fc = Linear(
+            input_dim=self.config["hidden_size"],
+            output_dim=num_labels,
+            param_attr=fluid.ParamAttr(
+                name="cls_out_w",
+                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+            bias_attr=fluid.ParamAttr(
+                name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
+    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
+        """
+        forward
+        """
+        enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
+                                                     sentence_ids, input_mask)
+        cls_feats = fluid.layers.dropout(
+            x=next_sent_feat,
+            dropout_prob=0.1,
+            dropout_implementation="upscale_in_train")
+        pred = self.cls_fc(cls_feats)
+        return pred
+def main():
    config = Config(yaml_file="./bert.yaml")
    config.build()
@@ -35,8 +81,6 @@ def train():
    bert_config = BertConfig(config.bert_config_path)
    bert_config.print_config()
-    trainer_count = fluid.dygraph.parallel.Env().nranks
    tokenizer = tokenization.FullTokenizer(
        vocab_file=config.vocab_path, do_lower_case=config.do_lower_case)
@@ -52,15 +96,26 @@ def train():
        return BertInputExample(
            uid=uid, text_a=text_a, text_b=text_b, label=label)
-    bert_dataloader = BertDataLoader(
+    train_dataloader = BertDataLoader(
        "./data/glue_data/MNLI/train.tsv",
        tokenizer, ["contradiction", "entailment", "neutral"],
-        max_seq_length=64,
+        max_seq_length=config.max_seq_len,
-        batch_size=32,
+        batch_size=config.batch_size,
        line_processor=mnli_line_processor,
-        mode="leveldb")
+        mode="leveldb",
+        phase="train")
-    num_train_examples = len(bert_dataloader.dataset)
+    dev_dataloader = BertDataLoader(
+        "./data/glue_data/MNLI/dev_matched.tsv",
+        tokenizer, ["contradiction", "entailment", "neutral"],
+        max_seq_length=config.max_seq_len,
+        batch_size=config.batch_size,
+        line_processor=mnli_line_processor,
+        shuffle=False,
+        phase="predict")
+    trainer_count = fluid.dygraph.parallel.Env().nranks
+    num_train_examples = len(train_dataloader.dataset)
    max_train_steps = config.epoch * num_train_examples // config.batch_size // trainer_count
    warmup_steps = int(max_train_steps * config.warmup_proportion)
@@ -83,7 +138,6 @@ def train():
        config,
        bert_config,
        len(["contradiction", "entailment", "neutral"]),
-        is_training=True,
        return_pooled_out=True)
    optimizer = Optimizer(
@@ -107,10 +161,15 @@ def train():
    cls_model.bert_layer.init_parameters(
        config.init_pretraining_params, verbose=config.verbose)
-    cls_model.fit(train_data=bert_dataloader.dataloader, epochs=config.epoch)
+    # do train
+    cls_model.fit(train_data=train_dataloader.dataloader,
+                  epochs=config.epoch,
+                  save_dir=config.checkpoints)
-    return cls_model
+    # do eval
+    cls_model.evaluate(
+        eval_data=dev_dataloader.dataloader, batch_size=config.batch_size)
 if __name__ == '__main__':
-    cls_model = train()
+    main()
--- a/examples/bert_leveldb/cls.py
+++ b/examples/bert_leveldb/cls.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"dygraph transformer layers"
-import six
-import json
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid.dygraph import Linear, Layer
-from hapi.text.bert import BertEncoder
-from hapi.model import Model
-class ClsModelLayer(Model):
-    """
-    classify model
-    """
-    def __init__(self,
-                 args,
-                 config,
-                 num_labels,
-                 is_training=True,
-                 return_pooled_out=True,
-                 use_fp16=False):
-        super(ClsModelLayer, self).__init__()
-        self.config = config
-        self.is_training = is_training
-        self.use_fp16 = use_fp16
-        self.loss_scaling = args.loss_scaling
-        self.bert_layer = BertEncoder(
-            config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
-        self.cls_fc = Linear(
-            input_dim=self.config["hidden_size"],
-            output_dim=num_labels,
-            param_attr=fluid.ParamAttr(
-                name="cls_out_w",
-                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
-            bias_attr=fluid.ParamAttr(
-                name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
-    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
-        """
-        forward
-        """
-        enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
-                                                     sentence_ids, input_mask)
-        cls_feats = fluid.layers.dropout(
-            x=next_sent_feat,
-            dropout_prob=0.1,
-            dropout_implementation="upscale_in_train")
-        logits = self.cls_fc(cls_feats)
-        return logits
--- a/examples/bert_leveldb/nohup.out
+++ b/examples/bert_leveldb/nohup.out
-grep: warning: GREP_OPTIONS is deprecated; please use an alias or script
-2020-04-13 13:08:30,568-WARNING: use_shared_memory can only be used in multi-process mode(num_workers > 0), set use_shared_memory as False
-W0413 13:08:31.584532 119379 device_context.cc:237] Please NOTE: device: 0, CUDA Capability: 70, Driver API Version: 10.1, Runtime API Version: 9.0
-W0413 13:08:31.589192 119379 device_context.cc:245] device: 0, cuDNN Version: 7.5.
----------------------------------------------------------------------
-bert_config_path:				./data/pretrained_models/uncased_L-12_H-768_A-12//bert_config.json
-init_checkpoint:				None
-init_pretraining_params:				./data/pretrained_models/uncased_L-12_H-768_A-12//dygraph_params/
-checkpoints:				./data/saved_model/mnli_models
-epoch:				3
-learning_rate:				5e-05
-lr_scheduler:				linear_warmup_decay
-weight_decay:				0.01
-warmup_proportion:				0.1
-save_steps:				1000
-validation_steps:				100
-loss_scaling:				1.0
-skip_steps:				10
-data_dir:				./data/glue_data/MNLI/
-vocab_path:				./data/pretrained_models/uncased_L-12_H-768_A-12//vocab.txt
-max_seq_len:				128
-batch_size:				64
-in_tokens:				False
-do_lower_case:				True
-random_seed:				5512
-use_cuda:				True
-shuffle:				True
-do_train:				True
-do_test:				True
-use_data_parallel:				False
-verbose:				False
----------------------------------------------------------------------
-attention_probs_dropout_prob: 0.1
-hidden_act: gelu
-hidden_dropout_prob: 0.1
-hidden_size: 768
-initializer_range: 0.02
-intermediate_size: 3072
-max_position_embeddings: 512
-num_attention_heads: 12
-num_hidden_layers: 12
-type_vocab_size: 2
-vocab_size: 30522
------------------------------------------------
-Trainer count: 1
-Num train examples: 392703
-Max train steps: 18407
-Num warmup steps: 1840
-Epoch 1/3
-step    10/12272 - loss: 1.1000 - acc_top1: 0.3531 - acc_top2: 0.6813 - 1s/step
-step    20/12272 - loss: 1.1878 - acc_top1: 0.3578 - acc_top2: 0.6875 - 1s/step
-step    30/12272 - loss: 1.0812 - acc_top1: 0.3708 - acc_top2: 0.6948 - 1s/step
-step    40/12272 - loss: 1.1244 - acc_top1: 0.3773 - acc_top2: 0.6992 - 1s/step
-step    50/12272 - loss: 1.1202 - acc_top1: 0.3756 - acc_top2: 0.7006 - 1s/step
-step    60/12272 - loss: 1.1291 - acc_top1: 0.3703 - acc_top2: 0.6990 - 1s/step
-step    70/12272 - loss: 1.0991 - acc_top1: 0.3634 - acc_top2: 0.6946 - 1s/step
-step    80/12272 - loss: 1.0988 - acc_top1: 0.3602 - acc_top2: 0.6914 - 1s/step
-step    90/12272 - loss: 1.0718 - acc_top1: 0.3646 - acc_top2: 0.6889 - 1s/step
-step   100/12272 - loss: 1.0949 - acc_top1: 0.3638 - acc_top2: 0.6878 - 1s/step
-step   110/12272 - loss: 1.1120 - acc_top1: 0.3608 - acc_top2: 0.6895 - 1s/step
-step   120/12272 - loss: 1.1105 - acc_top1: 0.3622 - acc_top2: 0.6922 - 1s/step
-step   130/12272 - loss: 1.0958 - acc_top1: 0.3623 - acc_top2: 0.6940 - 1s/step
-step   140/12272 - loss: 1.0995 - acc_top1: 0.3636 - acc_top2: 0.6926 - 1s/step
-step   150/12272 - loss: 1.1272 - acc_top1: 0.3671 - acc_top2: 0.6950 - 1s/step
-step   160/12272 - loss: 1.0850 - acc_top1: 0.3697 - acc_top2: 0.6975 - 1s/step
-step   170/12272 - loss: 1.0607 - acc_top1: 0.3691 - acc_top2: 0.6991 - 1s/step
-step   180/12272 - loss: 1.0623 - acc_top1: 0.3707 - acc_top2: 0.6991 - 1s/step
-step   190/12272 - loss: 1.1092 - acc_top1: 0.3697 - acc_top2: 0.6997 - 1s/step
-step   200/12272 - loss: 1.1046 - acc_top1: 0.3713 - acc_top2: 0.7030 - 1s/step
-step   210/12272 - loss: 1.0945 - acc_top1: 0.3720 - acc_top2: 0.7043 - 1s/step
-step   220/12272 - loss: 1.0935 - acc_top1: 0.3719 - acc_top2: 0.7051 - 1s/step
-step   230/12272 - loss: 1.1567 - acc_top1: 0.3742 - acc_top2: 0.7048 - 1s/step
-step   240/12272 - loss: 1.0745 - acc_top1: 0.3766 - acc_top2: 0.7081 - 1s/step
-step   250/12272 - loss: 1.0664 - acc_top1: 0.3756 - acc_top2: 0.7090 - 1s/step
-step   260/12272 - loss: 1.0770 - acc_top1: 0.3751 - acc_top2: 0.7085 - 1s/step
-step   270/12272 - loss: 1.1008 - acc_top1: 0.3730 - acc_top2: 0.7088 - 1s/step
-step   280/12272 - loss: 1.0850 - acc_top1: 0.3737 - acc_top2: 0.7098 - 1s/step
-step   290/12272 - loss: 1.0759 - acc_top1: 0.3747 - acc_top2: 0.7100 - 1s/step
-step   300/12272 - loss: 1.0352 - acc_top1: 0.3758 - acc_top2: 0.7108 - 1s/step
-step   310/12272 - loss: 1.0224 - acc_top1: 0.3786 - acc_top2: 0.7127 - 1s/step
-step   320/12272 - loss: 1.0919 - acc_top1: 0.3800 - acc_top2: 0.7137 - 1s/step
-step   330/12272 - loss: 1.0884 - acc_top1: 0.3825 - acc_top2: 0.7145 - 1s/step
-step   340/12272 - loss: 1.1380 - acc_top1: 0.3849 - acc_top2: 0.7157 - 1s/step
-step   350/12272 - loss: 0.9523 - acc_top1: 0.3890 - acc_top2: 0.7176 - 1s/step
-step   360/12272 - loss: 0.9963 - acc_top1: 0.3922 - acc_top2: 0.7191 - 1s/step
-step   370/12272 - loss: 1.1187 - acc_top1: 0.3955 - acc_top2: 0.7205 - 1s/step
-step   380/12272 - loss: 0.9634 - acc_top1: 0.3988 - acc_top2: 0.7229 - 1s/step
-step   390/12272 - loss: 0.9944 - acc_top1: 0.4017 - acc_top2: 0.7254 - 1s/step
-step   400/12272 - loss: 1.1071 - acc_top1: 0.4044 - acc_top2: 0.7272 - 1s/step
-step   410/12272 - loss: 0.9307 - acc_top1: 0.4070 - acc_top2: 0.7293 - 1s/step
-step   420/12272 - loss: 1.1307 - acc_top1: 0.4087 - acc_top2: 0.7315 - 1s/step
-step   430/12272 - loss: 0.9936 - acc_top1: 0.4110 - acc_top2: 0.7334 - 1s/step
-step   440/12272 - loss: 0.9791 - acc_top1: 0.4139 - acc_top2: 0.7357 - 1s/step
-step   450/12272 - loss: 1.0112 - acc_top1: 0.4147 - acc_top2: 0.7372 - 1s/step
-step   460/12272 - loss: 0.8554 - acc_top1: 0.4179 - acc_top2: 0.7395 - 1s/step
-step   470/12272 - loss: 0.9411 - acc_top1: 0.4198 - acc_top2: 0.7406 - 1s/step
-step   480/12272 - loss: 0.8481 - acc_top1: 0.4231 - acc_top2: 0.7424 - 1s/step
-step   490/12272 - loss: 1.0338 - acc_top1: 0.4261 - acc_top2: 0.7441 - 1s/step
-step   500/12272 - loss: 0.9651 - acc_top1: 0.4281 - acc_top2: 0.7459 - 1s/step
-step   510/12272 - loss: 0.8091 - acc_top1: 0.4306 - acc_top2: 0.7479 - 1s/step
-step   520/12272 - loss: 1.0528 - acc_top1: 0.4325 - acc_top2: 0.7489 - 1s/step
-step   530/12272 - loss: 0.9898 - acc_top1: 0.4338 - acc_top2: 0.7500 - 1s/step
-step   540/12272 - loss: 0.7900 - acc_top1: 0.4364 - acc_top2: 0.7519 - 1s/step
-step   550/12272 - loss: 0.9055 - acc_top1: 0.4389 - acc_top2: 0.7534 - 1s/step
-step   560/12272 - loss: 1.0092 - acc_top1: 0.4410 - acc_top2: 0.7549 - 1s/step
-step   570/12272 - loss: 0.7068 - acc_top1: 0.4441 - acc_top2: 0.7570 - 1s/step
-step   580/12272 - loss: 0.9695 - acc_top1: 0.4455 - acc_top2: 0.7581 - 1s/step
-step   590/12272 - loss: 0.8640 - acc_top1: 0.4487 - acc_top2: 0.7600 - 1s/step
-step   600/12272 - loss: 0.9068 - acc_top1: 0.4514 - acc_top2: 0.7618 - 1s/step
-step   610/12272 - loss: 0.9023 - acc_top1: 0.4524 - acc_top2: 0.7627 - 1s/step
-step   620/12272 - loss: 0.7377 - acc_top1: 0.4552 - acc_top2: 0.7640 - 1s/step
-step   630/12272 - loss: 0.8900 - acc_top1: 0.4574 - acc_top2: 0.7659 - 1s/step
-step   640/12272 - loss: 0.8902 - acc_top1: 0.4590 - acc_top2: 0.7669 - 1s/step
-step   650/12272 - loss: 0.9069 - acc_top1: 0.4608 - acc_top2: 0.7686 - 1s/step
-step   660/12272 - loss: 0.9630 - acc_top1: 0.4631 - acc_top2: 0.7699 - 1s/step
-step   670/12272 - loss: 0.9005 - acc_top1: 0.4652 - acc_top2: 0.7712 - 1s/step
-step   680/12272 - loss: 1.0725 - acc_top1: 0.4670 - acc_top2: 0.7725 - 1s/step
-step   690/12272 - loss: 0.8322 - acc_top1: 0.4689 - acc_top2: 0.7739 - 1s/step
-step   700/12272 - loss: 0.9874 - acc_top1: 0.4714 - acc_top2: 0.7753 - 1s/step
-step   710/12272 - loss: 0.7915 - acc_top1: 0.4728 - acc_top2: 0.7765 - 1s/step
-step   720/12272 - loss: 0.7174 - acc_top1: 0.4746 - acc_top2: 0.7777 - 1s/step
-step   730/12272 - loss: 0.7635 - acc_top1: 0.4770 - acc_top2: 0.7793 - 1s/step
-step   740/12272 - loss: 0.9180 - acc_top1: 0.4793 - acc_top2: 0.7804 - 1s/step
-step   750/12272 - loss: 0.8424 - acc_top1: 0.4817 - acc_top2: 0.7815 - 1s/step
-step   760/12272 - loss: 0.9357 - acc_top1: 0.4837 - acc_top2: 0.7829 - 1s/step
-step   770/12272 - loss: 0.7643 - acc_top1: 0.4858 - acc_top2: 0.7839 - 1s/step
-step   780/12272 - loss: 0.8910 - acc_top1: 0.4868 - acc_top2: 0.7849 - 1s/step
-step   790/12272 - loss: 0.8781 - acc_top1: 0.4888 - acc_top2: 0.7862 - 1s/step
-step   800/12272 - loss: 0.8005 - acc_top1: 0.4907 - acc_top2: 0.7877 - 1s/step
-step   810/12272 - loss: 0.6740 - acc_top1: 0.4929 - acc_top2: 0.7889 - 1s/step
-step   820/12272 - loss: 0.7026 - acc_top1: 0.4947 - acc_top2: 0.7898 - 1s/step
-step   830/12272 - loss: 0.8666 - acc_top1: 0.4964 - acc_top2: 0.7908 - 1s/step
-step   840/12272 - loss: 0.6296 - acc_top1: 0.4983 - acc_top2: 0.7920 - 1s/step
-step   850/12272 - loss: 0.7907 - acc_top1: 0.4992 - acc_top2: 0.7930 - 1s/step
-step   860/12272 - loss: 0.7292 - acc_top1: 0.5007 - acc_top2: 0.7935 - 1s/step
-step   870/12272 - loss: 0.7498 - acc_top1: 0.5026 - acc_top2: 0.7944 - 1s/step
-step   880/12272 - loss: 0.9928 - acc_top1: 0.5040 - acc_top2: 0.7953 - 1s/step
-step   890/12272 - loss: 1.0025 - acc_top1: 0.5056 - acc_top2: 0.7962 - 1s/step
-step   900/12272 - loss: 0.7810 - acc_top1: 0.5071 - acc_top2: 0.7969 - 1s/step
-step   910/12272 - loss: 0.6114 - acc_top1: 0.5090 - acc_top2: 0.7978 - 1s/step
-step   920/12272 - loss: 0.7780 - acc_top1: 0.5105 - acc_top2: 0.7988 - 1s/step
-step   930/12272 - loss: 0.9457 - acc_top1: 0.5116 - acc_top2: 0.7995 - 1s/step
-step   940/12272 - loss: 0.7907 - acc_top1: 0.5135 - acc_top2: 0.8006 - 1s/step
-step   950/12272 - loss: 0.5520 - acc_top1: 0.5153 - acc_top2: 0.8013 - 1s/step
-step   960/12272 - loss: 0.8251 - acc_top1: 0.5168 - acc_top2: 0.8022 - 1s/step
-step   970/12272 - loss: 0.8482 - acc_top1: 0.5179 - acc_top2: 0.8031 - 1s/step
-step   980/12272 - loss: 0.8010 - acc_top1: 0.5196 - acc_top2: 0.8038 - 1s/step
-step   990/12272 - loss: 0.8326 - acc_top1: 0.5207 - acc_top2: 0.8047 - 1s/step
-step  1000/12272 - loss: 0.6979 - acc_top1: 0.5222 - acc_top2: 0.8057 - 1s/step
-step  1010/12272 - loss: 0.7506 - acc_top1: 0.5234 - acc_top2: 0.8065 - 1s/step
-step  1020/12272 - loss: 0.8457 - acc_top1: 0.5248 - acc_top2: 0.8073 - 1s/step
-step  1030/12272 - loss: 0.8698 - acc_top1: 0.5263 - acc_top2: 0.8082 - 1s/step
-step  1040/12272 - loss: 0.7016 - acc_top1: 0.5279 - acc_top2: 0.8091 - 1s/step
-step  1050/12272 - loss: 0.7766 - acc_top1: 0.5290 - acc_top2: 0.8099 - 1s/step
-step  1060/12272 - loss: 0.7994 - acc_top1: 0.5300 - acc_top2: 0.8105 - 1s/step
-step  1070/12272 - loss: 0.7053 - acc_top1: 0.5317 - acc_top2: 0.8115 - 1s/step
-step  1080/12272 - loss: 0.9085 - acc_top1: 0.5330 - acc_top2: 0.8125 - 1s/step
-step  1090/12272 - loss: 0.7556 - acc_top1: 0.5342 - acc_top2: 0.8134 - 1s/step
-step  1100/12272 - loss: 0.9364 - acc_top1: 0.5355 - acc_top2: 0.8141 - 1s/step
-step  1110/12272 - loss: 0.9403 - acc_top1: 0.5367 - acc_top2: 0.8148 - 1s/step
-step  1120/12272 - loss: 0.8228 - acc_top1: 0.5375 - acc_top2: 0.8152 - 1s/step
-step  1130/12272 - loss: 0.6802 - acc_top1: 0.5388 - acc_top2: 0.8160 - 1s/step
-step  1140/12272 - loss: 0.8222 - acc_top1: 0.5397 - acc_top2: 0.8167 - 1s/step
-step  1150/12272 - loss: 0.9321 - acc_top1: 0.5407 - acc_top2: 0.8172 - 1s/step
-step  1160/12272 - loss: 0.7478 - acc_top1: 0.5417 - acc_top2: 0.8181 - 1s/step
-step  1170/12272 - loss: 0.7976 - acc_top1: 0.5430 - acc_top2: 0.8188 - 1s/step
-step  1180/12272 - loss: 0.7386 - acc_top1: 0.5441 - acc_top2: 0.8192 - 1s/step
-step  1190/12272 - loss: 0.6448 - acc_top1: 0.5450 - acc_top2: 0.8200 - 1s/step
-step  1200/12272 - loss: 0.7441 - acc_top1: 0.5463 - acc_top2: 0.8206 - 1s/step
-step  1210/12272 - loss: 0.8171 - acc_top1: 0.5476 - acc_top2: 0.8213 - 1s/step
-step  1220/12272 - loss: 0.7480 - acc_top1: 0.5487 - acc_top2: 0.8219 - 1s/step
-step  1230/12272 - loss: 0.6363 - acc_top1: 0.5497 - acc_top2: 0.8225 - 1s/step
-step  1240/12272 - loss: 0.6630 - acc_top1: 0.5507 - acc_top2: 0.8231 - 1s/step
-step  1250/12272 - loss: 0.8668 - acc_top1: 0.5517 - acc_top2: 0.8237 - 1s/step
-step  1260/12272 - loss: 0.6057 - acc_top1: 0.5527 - acc_top2: 0.8243 - 1s/step
-step  1270/12272 - loss: 0.8432 - acc_top1: 0.5538 - acc_top2: 0.8248 - 1s/step
-step  1280/12272 - loss: 0.8447 - acc_top1: 0.5546 - acc_top2: 0.8253 - 1s/step
-step  1290/12272 - loss: 0.6928 - acc_top1: 0.5556 - acc_top2: 0.8261 - 1s/step
-step  1300/12272 - loss: 0.7872 - acc_top1: 0.5567 - acc_top2: 0.8266 - 1s/step
-step  1310/12272 - loss: 0.7968 - acc_top1: 0.5570 - acc_top2: 0.8269 - 1s/step
-step  1320/12272 - loss: 0.8059 - acc_top1: 0.5580 - acc_top2: 0.8275 - 1s/step
-step  1330/12272 - loss: 0.8603 - acc_top1: 0.5587 - acc_top2: 0.8278 - 1s/step
-step  1340/12272 - loss: 0.7872 - acc_top1: 0.5599 - acc_top2: 0.8285 - 1s/step
-step  1350/12272 - loss: 0.7037 - acc_top1: 0.5609 - acc_top2: 0.8290 - 1s/step
-step  1360/12272 - loss: 0.8268 - acc_top1: 0.5618 - acc_top2: 0.8297 - 1s/step
-step  1370/12272 - loss: 0.5962 - acc_top1: 0.5627 - acc_top2: 0.8303 - 1s/step
-step  1380/12272 - loss: 0.7712 - acc_top1: 0.5638 - acc_top2: 0.8310 - 1s/step
-step  1390/12272 - loss: 0.5770 - acc_top1: 0.5650 - acc_top2: 0.8315 - 1s/step
-step  1400/12272 - loss: 0.7174 - acc_top1: 0.5656 - acc_top2: 0.8319 - 1s/step
-step  1410/12272 - loss: 0.6224 - acc_top1: 0.5660 - acc_top2: 0.8323 - 1s/step
-step  1420/12272 - loss: 0.6782 - acc_top1: 0.5671 - acc_top2: 0.8328 - 1s/step
-step  1430/12272 - loss: 0.4087 - acc_top1: 0.5682 - acc_top2: 0.8335 - 1s/step
-step  1440/12272 - loss: 0.7534 - acc_top1: 0.5692 - acc_top2: 0.8342 - 1s/step
-step  1450/12272 - loss: 0.6446 - acc_top1: 0.5702 - acc_top2: 0.8345 - 1s/step
-step  1460/12272 - loss: 0.6606 - acc_top1: 0.5712 - acc_top2: 0.8351 - 1s/step
-step  1470/12272 - loss: 0.7308 - acc_top1: 0.5723 - acc_top2: 0.8357 - 1s/step
-step  1480/12272 - loss: 0.9016 - acc_top1: 0.5727 - acc_top2: 0.8359 - 1s/step
-step  1490/12272 - loss: 0.8445 - acc_top1: 0.5730 - acc_top2: 0.8362 - 1s/step
-step  1500/12272 - loss: 0.8217 - acc_top1: 0.5737 - acc_top2: 0.8367 - 1s/step
-step  1510/12272 - loss: 0.8413 - acc_top1: 0.5747 - acc_top2: 0.8370 - 1s/step
-step  1520/12272 - loss: 0.4643 - acc_top1: 0.5757 - acc_top2: 0.8376 - 1s/step
-step  1530/12272 - loss: 0.9351 - acc_top1: 0.5764 - acc_top2: 0.8381 - 1s/step
-step  1540/12272 - loss: 0.7856 - acc_top1: 0.5773 - acc_top2: 0.8386 - 1s/step
-step  1550/12272 - loss: 0.5921 - acc_top1: 0.5780 - acc_top2: 0.8390 - 1s/step
-step  1560/12272 - loss: 0.4460 - acc_top1: 0.5788 - acc_top2: 0.8395 - 1s/step
-step  1570/12272 - loss: 0.6814 - acc_top1: 0.5793 - acc_top2: 0.8401 - 1s/step
-step  1580/12272 - loss: 0.4115 - acc_top1: 0.5805 - acc_top2: 0.8407 - 1s/step
-step  1590/12272 - loss: 0.9326 - acc_top1: 0.5810 - acc_top2: 0.8410 - 1s/step
-step  1600/12272 - loss: 0.6989 - acc_top1: 0.5818 - acc_top2: 0.8413 - 1s/step
-step  1610/12272 - loss: 0.5238 - acc_top1: 0.5826 - acc_top2: 0.8418 - 1s/step
-step  1620/12272 - loss: 0.5827 - acc_top1: 0.5832 - acc_top2: 0.8422 - 1s/step
-step  1630/12272 - loss: 0.7703 - acc_top1: 0.5838 - acc_top2: 0.8425 - 1s/step
-step  1640/12272 - loss: 0.7926 - acc_top1: 0.5844 - acc_top2: 0.8428 - 1s/step
-step  1650/12272 - loss: 0.7143 - acc_top1: 0.5851 - acc_top2: 0.8434 - 1s/step
-step  1660/12272 - loss: 0.6240 - acc_top1: 0.5858 - acc_top2: 0.8438 - 1s/step
-step  1670/12272 - loss: 0.7869 - acc_top1: 0.5862 - acc_top2: 0.8440 - 1s/step
-step  1680/12272 - loss: 0.6485 - acc_top1: 0.5868 - acc_top2: 0.8444 - 1s/step
-step  1690/12272 - loss: 0.7539 - acc_top1: 0.5876 - acc_top2: 0.8450 - 1s/step
-step  1700/12272 - loss: 0.6173 - acc_top1: 0.5882 - acc_top2: 0.8454 - 1s/step
-step  1710/12272 - loss: 0.8056 - acc_top1: 0.5890 - acc_top2: 0.8458 - 1s/step
-step  1720/12272 - loss: 0.7035 - acc_top1: 0.5898 - acc_top2: 0.8463 - 1s/step
-step  1730/12272 - loss: 0.5892 - acc_top1: 0.5908 - acc_top2: 0.8468 - 1s/step
-step  1740/12272 - loss: 0.7755 - acc_top1: 0.5915 - acc_top2: 0.8472 - 1s/step
-step  1750/12272 - loss: 0.6911 - acc_top1: 0.5920 - acc_top2: 0.8474 - 1s/step
-step  1760/12272 - loss: 0.6309 - acc_top1: 0.5926 - acc_top2: 0.8477 - 1s/step
-step  1770/12272 - loss: 0.7506 - acc_top1: 0.5932 - acc_top2: 0.8480 - 1s/step
-step  1780/12272 - loss: 0.8711 - acc_top1: 0.5939 - acc_top2: 0.8482 - 1s/step
-step  1790/12272 - loss: 0.9146 - acc_top1: 0.5945 - acc_top2: 0.8484 - 1s/step
-step  1800/12272 - loss: 0.6208 - acc_top1: 0.5952 - acc_top2: 0.8487 - 1s/step
-step  1810/12272 - loss: 0.8506 - acc_top1: 0.5959 - acc_top2: 0.8490 - 1s/step
-step  1820/12272 - loss: 0.8330 - acc_top1: 0.5965 - acc_top2: 0.8494 - 1s/step
-step  1830/12272 - loss: 0.8315 - acc_top1: 0.5970 - acc_top2: 0.8497 - 1s/step
-step  1840/12272 - loss: 0.6227 - acc_top1: 0.5977 - acc_top2: 0.8501 - 1s/step
-step  1850/12272 - loss: 0.5972 - acc_top1: 0.5985 - acc_top2: 0.8506 - 1s/step
-step  1860/12272 - loss: 0.6309 - acc_top1: 0.5992 - acc_top2: 0.8510 - 1s/step
-step  1870/12272 - loss: 0.8707 - acc_top1: 0.5995 - acc_top2: 0.8512 - 1s/step
-step  1880/12272 - loss: 0.6419 - acc_top1: 0.6004 - acc_top2: 0.8516 - 1s/step
-step  1890/12272 - loss: 0.6015 - acc_top1: 0.6010 - acc_top2: 0.8521 - 1s/step
-step  1900/12272 - loss: 0.6000 - acc_top1: 0.6015 - acc_top2: 0.8524 - 1s/step
-step  1910/12272 - loss: 0.7010 - acc_top1: 0.6020 - acc_top2: 0.8527 - 1s/step
-step  1920/12272 - loss: 0.8539 - acc_top1: 0.6026 - acc_top2: 0.8530 - 1s/step
-step  1930/12272 - loss: 0.8381 - acc_top1: 0.6031 - acc_top2: 0.8533 - 1s/step
-step  1940/12272 - loss: 0.5921 - acc_top1: 0.6039 - acc_top2: 0.8537 - 1s/step
-step  1950/12272 - loss: 0.4974 - acc_top1: 0.6047 - acc_top2: 0.8541 - 1s/step
-step  1960/12272 - loss: 0.8269 - acc_top1: 0.6052 - acc_top2: 0.8544 - 1s/step
-step  1970/12272 - loss: 0.6157 - acc_top1: 0.6058 - acc_top2: 0.8548 - 1s/step
-step  1980/12272 - loss: 1.0949 - acc_top1: 0.6064 - acc_top2: 0.8552 - 1s/step
-step  1990/12272 - loss: 0.6442 - acc_top1: 0.6070 - acc_top2: 0.8555 - 1s/step
-step  2000/12272 - loss: 0.8747 - acc_top1: 0.6073 - acc_top2: 0.8558 - 1s/step
-step  2010/12272 - loss: 0.8101 - acc_top1: 0.6078 - acc_top2: 0.8560 - 1s/step
-step  2020/12272 - loss: 0.8623 - acc_top1: 0.6082 - acc_top2: 0.8562 - 1s/step
-step  2030/12272 - loss: 0.6664 - acc_top1: 0.6089 - acc_top2: 0.8567 - 1s/step
-step  2040/12272 - loss: 0.7616 - acc_top1: 0.6092 - acc_top2: 0.8567 - 1s/step
-step  2050/12272 - loss: 0.7282 - acc_top1: 0.6095 - acc_top2: 0.8570 - 1s/step
-step  2060/12272 - loss: 0.6914 - acc_top1: 0.6099 - acc_top2: 0.8574 - 1s/step
-step  2070/12272 - loss: 0.6129 - acc_top1: 0.6105 - acc_top2: 0.8577 - 1s/step
-step  2080/12272 - loss: 0.5605 - acc_top1: 0.6111 - acc_top2: 0.8580 - 1s/step
-step  2090/12272 - loss: 0.6432 - acc_top1: 0.6116 - acc_top2: 0.8582 - 1s/step
-step  2100/12272 - loss: 0.6783 - acc_top1: 0.6121 - acc_top2: 0.8586 - 1s/step
-step  2110/12272 - loss: 0.5949 - acc_top1: 0.6128 - acc_top2: 0.8589 - 1s/step
-step  2120/12272 - loss: 0.7832 - acc_top1: 0.6134 - acc_top2: 0.8592 - 1s/step
-step  2130/12272 - loss: 0.6633 - acc_top1: 0.6139 - acc_top2: 0.8594 - 1s/step
-step  2140/12272 - loss: 0.8456 - acc_top1: 0.6143 - acc_top2: 0.8596 - 1s/step
-step  2150/12272 - loss: 0.7133 - acc_top1: 0.6150 - acc_top2: 0.8599 - 1s/step
-step  2160/12272 - loss: 0.4699 - acc_top1: 0.6155 - acc_top2: 0.8602 - 1s/step
-step  2170/12272 - loss: 0.6013 - acc_top1: 0.6161 - acc_top2: 0.8605 - 1s/step
-step  2180/12272 - loss: 0.5676 - acc_top1: 0.6165 - acc_top2: 0.8608 - 1s/step
-step  2190/12272 - loss: 0.5850 - acc_top1: 0.6172 - acc_top2: 0.8611 - 1s/step
-step  2200/12272 - loss: 0.6887 - acc_top1: 0.6177 - acc_top2: 0.8612 - 1s/step
-step  2210/12272 - loss: 0.5706 - acc_top1: 0.6180 - acc_top2: 0.8614 - 1s/step
-step  2220/12272 - loss: 0.8251 - acc_top1: 0.6184 - acc_top2: 0.8617 - 1s/step
-step  2230/12272 - loss: 0.6532 - acc_top1: 0.6188 - acc_top2: 0.8620 - 1s/step
-step  2240/12272 - loss: 0.5888 - acc_top1: 0.6194 - acc_top2: 0.8623 - 1s/step
-step  2250/12272 - loss: 0.6360 - acc_top1: 0.6198 - acc_top2: 0.8625 - 1s/step
-step  2260/12272 - loss: 1.0555 - acc_top1: 0.6202 - acc_top2: 0.8628 - 1s/step
-step  2270/12272 - loss: 0.4848 - acc_top1: 0.6207 - acc_top2: 0.8629 - 1s/step
-step  2280/12272 - loss: 0.7243 - acc_top1: 0.6212 - acc_top2: 0.8632 - 1s/step
-step  2290/12272 - loss: 0.4358 - acc_top1: 0.6216 - acc_top2: 0.8635 - 1s/step
-step  2300/12272 - loss: 0.5473 - acc_top1: 0.6221 - acc_top2: 0.8637 - 1s/step
-step  2310/12272 - loss: 0.6440 - acc_top1: 0.6226 - acc_top2: 0.8640 - 1s/step
-step  2320/12272 - loss: 0.5785 - acc_top1: 0.6233 - acc_top2: 0.8643 - 1s/step
-step  2330/12272 - loss: 0.7199 - acc_top1: 0.6237 - acc_top2: 0.8646 - 1s/step
-step  2340/12272 - loss: 0.5622 - acc_top1: 0.6241 - acc_top2: 0.8647 - 1s/step
-step  2350/12272 - loss: 0.6742 - acc_top1: 0.6245 - acc_top2: 0.8650 - 1s/step
-step  2360/12272 - loss: 0.8149 - acc_top1: 0.6249 - acc_top2: 0.8652 - 1s/step
-step  2370/12272 - loss: 0.5900 - acc_top1: 0.6253 - acc_top2: 0.8654 - 1s/step
-step  2380/12272 - loss: 0.8046 - acc_top1: 0.6256 - acc_top2: 0.8656 - 1s/step
-step  2390/12272 - loss: 0.6097 - acc_top1: 0.6262 - acc_top2: 0.8659 - 1s/step
-step  2400/12272 - loss: 0.5936 - acc_top1: 0.6266 - acc_top2: 0.8660 - 1s/step
-step  2410/12272 - loss: 0.7245 - acc_top1: 0.6270 - acc_top2: 0.8662 - 1s/step
-step  2420/12272 - loss: 0.6349 - acc_top1: 0.6274 - acc_top2: 0.8665 - 1s/step
-step  2430/12272 - loss: 0.7009 - acc_top1: 0.6278 - acc_top2: 0.8668 - 1s/step
-step  2440/12272 - loss: 0.3881 - acc_top1: 0.6282 - acc_top2: 0.8670 - 1s/step
-step  2450/12272 - loss: 0.5226 - acc_top1: 0.6286 - acc_top2: 0.8673 - 1s/step
-step  2460/12272 - loss: 0.5748 - acc_top1: 0.6292 - acc_top2: 0.8675 - 1s/step
-step  2470/12272 - loss: 0.4798 - acc_top1: 0.6297 - acc_top2: 0.8678 - 1s/step
-step  2480/12272 - loss: 0.5857 - acc_top1: 0.6303 - acc_top2: 0.8680 - 1s/step
-step  2490/12272 - loss: 0.6729 - acc_top1: 0.6308 - acc_top2: 0.8683 - 1s/step
-step  2500/12272 - loss: 0.6392 - acc_top1: 0.6312 - acc_top2: 0.8686 - 1s/step
-step  2510/12272 - loss: 0.9607 - acc_top1: 0.6315 - acc_top2: 0.8687 - 1s/step
-step  2520/12272 - loss: 0.6036 - acc_top1: 0.6319 - acc_top2: 0.8690 - 1s/step
-step  2530/12272 - loss: 0.6505 - acc_top1: 0.6324 - acc_top2: 0.8693 - 1s/step
-step  2540/12272 - loss: 0.4558 - acc_top1: 0.6329 - acc_top2: 0.8696 - 1s/step
-step  2550/12272 - loss: 0.4215 - acc_top1: 0.6333 - acc_top2: 0.8699 - 1s/step
-step  2560/12272 - loss: 0.6908 - acc_top1: 0.6338 - acc_top2: 0.8701 - 1s/step
-step  2570/12272 - loss: 0.5833 - acc_top1: 0.6342 - acc_top2: 0.8703 - 1s/step
-step  2580/12272 - loss: 0.8548 - acc_top1: 0.6346 - acc_top2: 0.8706 - 1s/step
-step  2590/12272 - loss: 0.5770 - acc_top1: 0.6351 - acc_top2: 0.8708 - 1s/step
-step  2600/12272 - loss: 0.4476 - acc_top1: 0.6355 - acc_top2: 0.8711 - 1s/step
-step  2610/12272 - loss: 0.4145 - acc_top1: 0.6360 - acc_top2: 0.8714 - 1s/step
-step  2620/12272 - loss: 0.6625 - acc_top1: 0.6365 - acc_top2: 0.8717 - 1s/step
-step  2630/12272 - loss: 0.4808 - acc_top1: 0.6369 - acc_top2: 0.8719 - 1s/step
--- a/examples/bert_leveldb/run_classifier_multi_gpu.sh
+++ b/examples/bert_leveldb/run_classifier_multi_gpu.sh
+#!/bin/bash
+# Fine-tune bert_classifier.py on the MNLI data with paddle.distributed.launch
+# on GPUs 0-3. All paths are relative to the directory the script is run from.
+BERT_BASE_PATH="./data/pretrained_models/uncased_L-12_H-768_A-12/"
+# NOTE: TASK_NAME is set for reference only; it is not passed to the command below.
+TASK_NAME='MNLI'
+DATA_PATH="./data/glue_data/MNLI/"
+CKPT_PATH="./data/saved_model/mnli_models"
+# start fine-tuning
+# Expansions are quoted so a path containing spaces or glob characters is
+# still passed as a single argument.
+python3.7 -m paddle.distributed.launch --started_port 8899 --selected_gpus=0,1,2,3 bert_classifier.py \
+    --use_cuda true \
+    --do_train true \
+    --do_test true \
+    --batch_size 64 \
+    --init_pretraining_params "${BERT_BASE_PATH}/dygraph_params/" \
+    --data_dir "${DATA_PATH}" \
+    --vocab_path "${BERT_BASE_PATH}/vocab.txt" \
+    --checkpoints "${CKPT_PATH}" \
+    --save_steps 1000 \
+    --weight_decay  0.01 \
+    --warmup_proportion 0.1 \
+    --validation_steps 100 \
+    --epoch 3 \
+    --max_seq_len 128 \
+    --bert_config_path "${BERT_BASE_PATH}/bert_config.json" \
+    --learning_rate 5e-5 \
+    --skip_steps 10 \
+    --shuffle true
--- a/examples/bert_leveldb/run_classifier_single_gpu.sh
+++ b/examples/bert_leveldb/run_classifier_single_gpu.sh
@@ -4,7 +4,7 @@ TASK_NAME='MNLI'
 DATA_PATH="./data/glue_data/MNLI/"
 CKPT_PATH="./data/saved_model/mnli_models"
-export CUDA_VISIBLE_DEVICES=7
+export CUDA_VISIBLE_DEVICES=0
 # start fine-tuning
 python3.7 bert_classifier.py\

--- a/hapi/text/bert/dataloader.py
+++ b/hapi/text/bert/dataloader.py
@@ -30,6 +30,7 @@ from hapi.distributed import DistributedBatchSampler
 from hapi.text.bert.data_processor import DataProcessor, XnliProcessor, ColaProcessor, MrpcProcessor, MnliProcessor
 from hapi.text.bert.batching import prepare_batch_data
 import hapi.text.tokenizer.tokenization as tokenization
+from paddle.fluid.dygraph.parallel import ParallelEnv, ParallelStrategy
 __all__ = [
    'BertInputExample', 'BertInputFeatures', 'SingleSentenceDataset',
@@ -227,6 +228,9 @@ class SingleSentenceDataset(Dataset):
        if line_processor is None:
            line_processor = default_line_processor
+        if ParallelEnv().nranks > 1:
+            leveldb_file = leveldb_file + "_" + str(ParallelEnv().local_rank)
        if not os.path.exists(leveldb_file):
            print("putting data %s into leveldb %s" %
                  (input_file, leveldb_file))
@@ -384,7 +388,12 @@ class BertDataLoader(object):
                 quotechar=None,
                 device=fluid.CPUPlace(),
                 num_workers=0,
-                 return_list=True):
+                 return_list=True,
+                 phase="train"):
+        assert phase in [
+            "train", "predict", "test"
+        ], "phase of BertDataLoader should be in [train, predict, test], but get %s" % phase
        self.dataset = SingleSentenceDataset(tokenizer, label_list,
                                             max_seq_length, mode)
@@ -394,15 +403,21 @@ class BertDataLoader(object):
                input_file, label_list, max_seq_length, tokenizer,
                line_processor, delimiter, quotechar)
        elif mode == "leveldb":
-            #prepare_leveldb(self, input_file, leveldb_file, label_list, max_seq_length, tokenizer, line_processor=None, delimiter="\t", quotechar=None):
            self.dataset.prepare_leveldb(input_file, leveldb_file, label_list,
                                         max_seq_length, tokenizer,
                                         line_processor, delimiter, quotechar)
        else:
            raise ValueError("mode should be in [all_in_memory, leveldb]")
-        self.sampler = DistributedBatchSampler(
+        if phase == "train":
-            self.dataset, batch_size, shuffle=shuffle, drop_last=drop_last)
+            self.sampler = DistributedBatchSampler(
+                self.dataset, batch_size, shuffle=shuffle, drop_last=drop_last)
+        elif phase == "test" or phase == "predict":
+            self.sampler = BatchSampler(
+                dataset=self.dataset,
+                batch_size=batch_size,
+                shuffle=shuffle,
+                drop_last=drop_last)
        self.dataloader = DataLoader(
            dataset=self.dataset,

--- a/hapi/text/text.py
+++ b/hapi/text/text.py
@@ -22,7 +22,7 @@ import sys
 if six.PY2:
    reload(sys)
    sys.setdefaultencoding('utf8')
 import ast
 import time
 import argparse as argparse
@@ -44,13 +44,12 @@ from paddle.fluid import layers
 from paddle.fluid.dygraph import Layer
 from paddle.fluid.layers import BeamSearchDecoder
 __all__ = [
    'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode',
    'BeamSearchDecoder', 'MultiHeadAttention', 'FFN',
    'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer',
-    'TransformerDecoder', 'TransformerBeamSearchDecoder', 'GRUCell', 'GRUEncoderCell', 
+    'TransformerDecoder', 'TransformerBeamSearchDecoder', 'Linear_chain_crf',
-    'BiGRU', 'Linear_chain_crf', 'Crf_decoding', 'SequenceTagging'
+    'Crf_decoding', 'SequenceTagging'
 ]
@@ -219,7 +218,19 @@ class BasicLSTMCell(RNNCell):
                 gate_activation=None,
                 activation=None,
                 forget_bias=1.0,
-                 dtype='float32'):
+                 dtype='float32',
+                 forget_gate_weights={"w": None,
+                                      "h": None,
+                                      "b": None},
+                 input_gate_weights={"w": None,
+                                     "h": None,
+                                     "b": None},
+                 output_gate_weights={"w": None,
+                                      "h": None,
+                                      "b": None},
+                 cell_weights={"w": None,
+                               "h": None,
+                               "b": None}):
        super(BasicLSTMCell, self).__init__()
        self._hidden_size = hidden_size
@@ -233,25 +244,225 @@ class BasicLSTMCell(RNNCell):
        self._dtype = dtype
        self._input_size = input_size
-        self._weight = self.create_parameter(
+        self.use_customized_weight = False
-            attr=self._param_attr,
+        for _weights in [
-            shape=[
+                forget_gate_weights, input_gate_weights, output_gate_weights,
-                self._input_size + self._hidden_size, 4 * self._hidden_size
+                cell_weights
-            ],
+        ]:
-            dtype=self._dtype)
+            for _key in _weights:
+                if _weights[_key] is not None:
-        self._bias = self.create_parameter(
+                    self.use_customized_weight = True
-            attr=self._bias_attr,
+                    break
-            shape=[4 * self._hidden_size],
+            if self.use_customized_weight:
-            dtype=self._dtype,
+                break
-            is_bias=True)
+        if not self.use_customized_weight:
+            self._weight = self.create_parameter(
+                attr=self._param_attr,
+                shape=[
+                    self._input_size + self._hidden_size, 4 * self._hidden_size
+                ],
+                dtype=self._dtype)
+            self._bias = self.create_parameter(
+                attr=self._bias_attr,
+                shape=[4 * self._hidden_size],
+                dtype=self._dtype,
+                is_bias=True)
+        else:
+            if "w" in forget_gate_weights and forget_gate_weights[
+                    "w"] is not None:
+                self.fg_w = forget_gate_weights["w"]
+            else:
+                if self._param_attr is not None and self._param_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(self._param_attr)
+                    tmp_param_attr.name += "_forget_gate_w"
+                else:
+                    tmp_param_attr = self._param_attr
+                self.fg_w = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._input_size, self._hidden_size],
+                    dtype=self._dtype)
+            if "h" in forget_gate_weights and forget_gate_weights[
+                    "h"] is not None:
+                self.fg_h = forget_gate_weights["h"]
+            else:
+                if self._param_attr is not None and self._param_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(self._param_attr)
+                    tmp_param_attr.name += "_forget_gate_h"
+                else:
+                    tmp_param_attr = self._param_attr
+                self.fg_h = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._hidden_size, self._hidden_size],
+                    dtype=self._dtype)
+            if "b" in forget_gate_weights and forget_gate_weights[
+                    "b"] is not None:
+                self.fg_b = forget_gate_weights["b"]
+            else:
+                if self._bias_attr is not None and self._bias_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(self._bias_attr)
+                    tmp_param_attr.name += "_forget_gate_b"
+                else:
+                    tmp_param_attr = self._bias_attr
+                self.fg_b = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._hidden_size],
+                    dtype=self._dtype,
+                    is_bias=True)
+            if "w" in input_gate_weights and input_gate_weights[
+                    "w"] is not None:
+                self.ig_w = input_gate_weights["w"]
+            else:
+                if self._param_attr is not None and self._param_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(self._param_attr)
+                    tmp_param_attr.name += "_input_gate_w"
+                else:
+                    tmp_param_attr = self._param_attr
+                self.ig_w = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._input_size, self._hidden_size],
+                    dtype=self._dtype)
+            if "h" in input_gate_weights and input_gate_weights[
+                    "h"] is not None:
+                self.ig_h = input_gate_weights["h"]
+            else:
+                if self._param_attr is not None and self._param_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(self._param_attr)
+                    tmp_param_attr.name += "_input_gate_h"
+                else:
+                    tmp_param_attr = self._param_attr
+                self.ig_h = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._hidden_size, self._hidden_size],
+                    dtype=self._dtype)
+            if "b" in input_gate_weights and input_gate_weights[
+                    "b"] is not None:
+                self.ig_b = input_gate_weights["b"]
+            else:
+                if self._bias_attr is not None and self._bias_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(self._bias_attr)
+                    tmp_param_attr.name += "_input_gate_b"
+                else:
+                    tmp_param_attr = self._bias_attr
+                self.ig_b = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._hidden_size],
+                    dtype=self._dtype,
+                    is_bias=True)
+            if "w" in output_gate_weights and output_gate_weights[
+                    "w"] is not None:
+                self.og_w = output_gate_weights["w"]
+            else:
+                if self._param_attr is not None and self._param_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(self._param_attr)
+                    tmp_param_attr.name += "_output_gate_w"
+                else:
+                    tmp_param_attr = self._param_attr
+                self.og_w = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._input_size, self._hidden_size],
+                    dtype=self._dtype)
+            if "h" in output_gate_weights and output_gate_weights[
+                    "h"] is not None:
+                self.og_h = output_gate_weights["h"]
+            else:
+                if self._param_attr is not None and self._param_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(self._param_attr)
+                    tmp_param_attr.name += "_output_gate_h"
+                else:
+                    tmp_param_attr = self._param_attr
+                self.og_h = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._hidden_size, self._hidden_size],
+                    dtype=self._dtype)
+            if "b" in output_gate_weights and output_gate_weights[
+                    "b"] is not None:
+                self.og_b = output_gate_weights["b"]
+            else:
+                if self._bias_attr is not None and self._bias_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(self._bias_attr)
+                    tmp_param_attr.name += "_output_gate_b"
+                else:
+                    tmp_param_attr = self._bias_attr
+                self.og_b = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._hidden_size],
+                    dtype=self._dtype,
+                    is_bias=True)
+            if "w" in cell_weights and cell_weights["w"] is not None:
+                self.c_w = cell_weights["w"]
+            else:
+                if self._param_attr is not None and self._param_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(self._param_attr)
+                    tmp_param_attr.name += "_cell_w"
+                else:
+                    tmp_param_attr = self._param_attr
+                self.c_w = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._input_size, self._hidden_size],
+                    dtype=self._dtype)
+            if "h" in cell_weights and cell_weights["h"] is not None:
+                self.c_h = cell_weights["h"]
+            else:
+                if self._param_attr is not None and self._param_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(self._param_attr)
+                    tmp_param_attr.name += "_cell_h"
+                else:
+                    tmp_param_attr = self._param_attr
+                self.c_h = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._hidden_size, self._hidden_size],
+                    dtype=self._dtype)
+            if "b" in cell_weights and cell_weights["b"] is not None:
+                self.c_b = cell_weights["b"]
+            else:
+                if self._bias_attr is not None and self._bias_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(self._bias_attr)
+                    tmp_param_attr.name += "_cell_b"
+                else:
+                    tmp_param_attr = self._bias_attr
+                self.c_b = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._hidden_size],
+                    dtype=self._dtype,
+                    is_bias=True)
    def forward(self, input, state):
+        if self.use_customized_weight:
+            weight_w = fluid.layers.concat(
+                [self.ig_w, self.c_w, self.fg_w, self.og_w], axis=-1)
+            weight_h = fluid.layers.concat(
+                [self.ig_h, self.c_h, self.fg_h, self.og_h], axis=-1)
+            _weight = fluid.layers.concat([weight_w, weight_h], axis=0)
+            _bias = fluid.layers.concat(
+                [self.ig_b, self.c_b, self.fg_b, self.og_b])
+        else:
+            _weight = self._weight
+            _bias = self._bias
        pre_hidden, pre_cell = state
        concat_input_hidden = layers.concat([input, pre_hidden], 1)
-        gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
+        gate_input = layers.matmul(x=concat_input_hidden, y=_weight)
-        gate_input = layers.elementwise_add(gate_input, self._bias)
+        gate_input = layers.elementwise_add(gate_input, _bias)
        i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
        new_cell = layers.elementwise_add(
            layers.elementwise_mul(
@@ -308,7 +519,16 @@ class BasicGRUCell(RNNCell):
                 bias_attr=None,
                 gate_activation=None,
                 activation=None,
-                 dtype='float32'):
+                 dtype='float32',
+                 update_gate_weights={"w": None,
+                                      "h": None,
+                                      "b": None},
+                 reset_gate_weights={"w": None,
+                                     "h": None,
+                                     "b": None},
+                 cell_weights={"w": None,
+                               "h": None,
+                               "b": None}):
        super(BasicGRUCell, self).__init__()
        self._input_size = input_size
        self._hidden_size = hidden_size
@@ -318,6 +538,20 @@ class BasicGRUCell(RNNCell):
        self._activation = activation or layers.tanh
        self._dtype = dtype
+        assert isinstance(update_gate_weights, dict)
+        assert isinstance(reset_gate_weights, dict)
+        assert isinstance(cell_weights, dict)
+        self.use_customized_weight = False
+        for _weights in [
+                update_gate_weights, reset_gate_weights, cell_weights
+        ]:
+            for _key in _weights:
+                if _weights[_key] is not None:
+                    self.use_customized_weight = True
+            if self.use_customized_weight:
+                break
        if self._param_attr is not None and self._param_attr.name is not None:
            gate_param_attr = copy.deepcopy(self._param_attr)
            candidate_param_attr = copy.deepcopy(self._param_attr)
@@ -327,43 +561,194 @@ class BasicGRUCell(RNNCell):
            gate_param_attr = self._param_attr
            candidate_param_attr = self._param_attr
-        self._gate_weight = self.create_parameter(
+        if not self.use_customized_weight:
-            attr=gate_param_attr,
+            self._gate_weight = self.create_parameter(
-            shape=[self._input_size + self._hidden_size, 2 * self._hidden_size],
+                attr=gate_param_attr,
-            dtype=self._dtype)
+                shape=[
+                    self._input_size + self._hidden_size, 2 * self._hidden_size
-        self._candidate_weight = self.create_parameter(
+                ],
-            attr=candidate_param_attr,
+                dtype=self._dtype)
-            shape=[self._input_size + self._hidden_size, self._hidden_size],
-            dtype=self._dtype)
+            self._candidate_weight = self.create_parameter(
+                attr=candidate_param_attr,
+                shape=[
+                    self._input_size + self._hidden_size, self._hidden_size
+                ],
+                dtype=self._dtype)
+            if self._bias_attr is not None and self._bias_attr.name is not None:
+                gate_bias_attr = copy.deepcopy(self._bias_attr)
+                candidate_bias_attr = copy.deepcopy(self._bias_attr)
+                gate_bias_attr.name += "_gate"
+                candidate_bias_attr.name += "_candidate"
+            else:
+                gate_bias_attr = self._bias_attr
+                candidate_bias_attr = self._bias_attr
+            self._gate_bias = self.create_parameter(
+                attr=gate_bias_attr,
+                shape=[2 * self._hidden_size],
+                dtype=self._dtype,
+                is_bias=True)
+            self._candidate_bias = self.create_parameter(
+                attr=candidate_bias_attr,
+                shape=[self._hidden_size],
+                dtype=self._dtype,
+                is_bias=True)
-        if self._bias_attr is not None and self._bias_attr.name is not None:
-            gate_bias_attr = copy.deepcopy(self._bias_attr)
-            candidate_bias_attr = copy.deepcopy(self._bias_attr)
-            gate_bias_attr.name += "_gate"
-            candidate_bias_attr.name += "_candidate"
        else:
-            gate_bias_attr = self._bias_attr
-            candidate_bias_attr = self._bias_attr
+            # create the parameters of gates in gru
+            if "w" in update_gate_weights and update_gate_weights[
-        self._gate_bias = self.create_parameter(
+                    "w"] is not None:
-            attr=gate_bias_attr,
+                self.ug_w = update_gate_weights["w"]
-            shape=[2 * self._hidden_size],
+            else:
-            dtype=self._dtype,
+                if gate_param_attr is not None and gate_param_attr.name is not None:
-            is_bias=True)
+                    tmp_param_attr = copy.deepcopy(gate_param_attr)
-        self._candidate_bias = self.create_parameter(
+                    tmp_param_attr.name += "_update_gate_w"
-            attr=candidate_bias_attr,
+                else:
-            shape=[self._hidden_size],
+                    tmp_param_attr = gate_param_attr
-            dtype=self._dtype,
+                self.ug_w = self.create_parameter(
-            is_bias=True)
+                    attr=tmp_param_attr,
+                    shape=[self._input_size, self._hidden_size],
+                    dtype=self._dtype)
+            if "h" in update_gate_weights and update_gate_weights[
+                    "h"] is not None:
+                self.ug_h = update_gate_weights["h"]
+            else:
+                if gate_param_attr is not None and gate_param_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(gate_param_attr)
+                    tmp_param_attr.name += "_update_gate_h"
+                else:
+                    tmp_param_attr = gate_param_attr
+                self.ug_h = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._hidden_size, self._hidden_size],
+                    dtype=self._dtype)
+            # The gate/candidate bias attrs are only built in the
+            # non-customized branch above, so they are unbound on this path.
+            # Derive them here from self._bias_attr with the same
+            # "_gate"/"_candidate" suffixes before any missing bias is created.
+            if self._bias_attr is not None and self._bias_attr.name is not None:
+                gate_bias_attr = copy.deepcopy(self._bias_attr)
+                candidate_bias_attr = copy.deepcopy(self._bias_attr)
+                gate_bias_attr.name += "_gate"
+                candidate_bias_attr.name += "_candidate"
+            else:
+                gate_bias_attr = self._bias_attr
+                candidate_bias_attr = self._bias_attr
+            # update gate bias: reuse the supplied value, else create one
+            if "b" in update_gate_weights and update_gate_weights[
+                    "b"] is not None:
+                self.ug_b = update_gate_weights["b"]
+            else:
+                if gate_bias_attr is not None and gate_bias_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(gate_bias_attr)
+                    tmp_param_attr.name += "_update_gate_b"
+                else:
+                    tmp_param_attr = gate_bias_attr
+                self.ug_b = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._hidden_size],
+                    dtype=self._dtype,
+                    is_bias=True)
+            # reset gate parameters
+            if "w" in reset_gate_weights and reset_gate_weights[
+                    "w"] is not None:
+                self.rg_w = reset_gate_weights["w"]
+            else:
+                if gate_param_attr is not None and gate_param_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(gate_param_attr)
+                    tmp_param_attr.name += "_reset_gate_w"
+                else:
+                    tmp_param_attr = gate_param_attr
+                self.rg_w = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._input_size, self._hidden_size],
+                    dtype=self._dtype)
+            if "h" in reset_gate_weights and reset_gate_weights[
+                    "h"] is not None:
+                self.rg_h = reset_gate_weights["h"]
+            else:
+                if gate_param_attr is not None and gate_param_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(gate_param_attr)
+                    tmp_param_attr.name += "_reset_gate_h"
+                else:
+                    tmp_param_attr = gate_param_attr
+                self.rg_h = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._hidden_size, self._hidden_size],
+                    dtype=self._dtype)
+            if "b" in reset_gate_weights and reset_gate_weights[
+                    "b"] is not None:
+                # Read from reset_gate_weights, matching the "w"/"h" cases;
+                # the previous name `reused_params` is not defined here.
+                self.rg_b = reset_gate_weights["b"]
+            else:
+                if gate_bias_attr is not None and gate_bias_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(gate_bias_attr)
+                    tmp_param_attr.name += "_reset_gate_b"
+                else:
+                    tmp_param_attr = gate_bias_attr
+                self.rg_b = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._hidden_size],
+                    dtype=self._dtype,
+                    is_bias=True)
+            # cell parameters (built from the candidate attrs)
+            if "w" in cell_weights and cell_weights["w"] is not None:
+                self.c_w = cell_weights["w"]
+            else:
+                if candidate_param_attr is not None and candidate_param_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(candidate_param_attr)
+                    tmp_param_attr.name += "_cell_w"
+                else:
+                    tmp_param_attr = candidate_param_attr
+                self.c_w = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._input_size, self._hidden_size],
+                    dtype=self._dtype)
+            if "h" in cell_weights and cell_weights["h"] is not None:
+                self.c_h = cell_weights["h"]
+            else:
+                if candidate_param_attr is not None and candidate_param_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(candidate_param_attr)
+                    tmp_param_attr.name += "_cell_h"
+                else:
+                    tmp_param_attr = candidate_param_attr
+                self.c_h = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._hidden_size, self._hidden_size],
+                    dtype=self._dtype)
+            if "b" in cell_weights and cell_weights["b"] is not None:
+                self.c_b = cell_weights["b"]
+            else:
+                if candidate_bias_attr is not None and candidate_bias_attr.name is not None:
+                    tmp_param_attr = copy.deepcopy(candidate_bias_attr)
+                    tmp_param_attr.name += "_cell_b"
+                else:
+                    tmp_param_attr = candidate_bias_attr
+                self.c_b = self.create_parameter(
+                    attr=tmp_param_attr,
+                    shape=[self._hidden_size],
+                    dtype=self._dtype,
+                    is_bias=True)
    def forward(self, input, state):
+        if self.use_customized_weight:
+            rg_weights = layers.concat([self.rg_w, self.rg_h], axis=0)
+            ug_weights = layers.concat([self.ug_w, self.ug_h], axis=0)
+            _gate_weight = layers.concat([rg_weights, ug_weights], axis=-1)
+            _candidate_weight = layers.concat([self.c_w, self.c_h], axis=0)
+            _gate_bias = layers.concat([self.rg_b, self.ug_b], axis=0)
+            _candidate_bias = self.c_b
+        else:
+            _gate_weight = self._gate_weight
+            _gate_bias = self._gate_bias
+            _candidate_weight = self._candidate_weight
+            _candidate_bias = self._candidate_bias
        pre_hidden = state
        concat_input_hidden = layers.concat([input, pre_hidden], axis=1)
-        gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)
+        gate_input = layers.matmul(x=concat_input_hidden, y=_gate_weight)
-        gate_input = layers.elementwise_add(gate_input, self._gate_bias)
+        gate_input = layers.elementwise_add(gate_input, _gate_bias)
        gate_input = self._gate_activation(gate_input)
        r, u = layers.split(gate_input, num_or_sections=2, dim=1)
@@ -371,8 +756,8 @@ class BasicGRUCell(RNNCell):
        r_hidden = r * pre_hidden
        candidate = layers.matmul(
-            layers.concat([input, r_hidden], 1), self._candidate_weight)
+            layers.concat([input, r_hidden], 1), _candidate_weight)
-        candidate = layers.elementwise_add(candidate, self._candidate_bias)
+        candidate = layers.elementwise_add(candidate, _candidate_bias)
        c = self._activation(candidate)
        new_hidden = u * pre_hidden + (1 - u) * c
@@ -700,7 +1085,11 @@ class PrePostProcessLayer(Layer):
    PrePostProcessLayer
    """
-    def __init__(self, process_cmd, d_model, dropout_rate):
+    def __init__(self,
+                 process_cmd,
+                 d_model,
+                 dropout_rate,
+                 reused_layer_norm=None):
        super(PrePostProcessLayer, self).__init__()
        self.process_cmd = process_cmd
        self.functors = []
@@ -708,16 +1097,21 @@ class PrePostProcessLayer(Layer):
            if cmd == "a":  # add residual connection
                self.functors.append(lambda x, y: x + y if y else x)
            elif cmd == "n":  # add layer normalization
+                if reused_layer_norm is not None:
+                    layer_norm = reused_layer_norm
+                else:
+                    layer_norm = LayerNorm(
+                        normalized_shape=d_model,
+                        param_attr=fluid.ParamAttr(
+                            initializer=fluid.initializer.Constant(1.)),
+                        bias_attr=fluid.ParamAttr(
+                            initializer=fluid.initializer.Constant(0.)))
                self.functors.append(
                    self.add_sublayer(
                        "layer_norm_%d" % len(
                            self.sublayers(include_sublayers=False)),
-                        LayerNorm(
+                        layer_norm))
-                            normalized_shape=d_model,
-                            param_attr=fluid.ParamAttr(
-                                initializer=fluid.initializer.Constant(1.)),
-                            bias_attr=fluid.ParamAttr(
-                                initializer=fluid.initializer.Constant(0.)))))
            elif cmd == "d":  # add dropout
                self.functors.append(lambda x: layers.dropout(
                    x, dropout_prob=dropout_rate, is_test=False)
@@ -737,21 +1131,48 @@ class MultiHeadAttention(Layer):
    Multi-Head Attention
    """
-    def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.):
+    def __init__(self,
+                 d_key,
+                 d_value,
+                 d_model,
+                 n_head=1,
+                 dropout_rate=0.0,
+                 reused_query_fc=None,
+                 reused_key_fc=None,
+                 reused_value_fc=None,
+                 reused_proj_fc=None):
+        """
+        Set up the query, key, value and output projection layers.
+
+        Each projection is taken from the matching ``reused_*_fc`` argument
+        when that argument is not None; otherwise a new bias-free Linear
+        layer is created. A reused layer is stored as-is: this constructor
+        does not check it against d_key, d_value, d_model or n_head.
+
+        Args:
+            d_key: factor of the query/key projection width; a newly
+                created q_fc or k_fc maps d_model -> d_key * n_head.
+            d_value: factor of the value projection width; a newly created
+                v_fc maps d_model -> d_value * n_head.
+            d_model: input width of q_fc/k_fc/v_fc and output width of
+                proj_fc when those layers are created here.
+            n_head: number of heads, stored on the instance and used to
+                size newly created projections.
+            dropout_rate: stored on the instance as ``self.dropout_rate``.
+            reused_query_fc: optional existing layer to use as ``q_fc``.
+            reused_key_fc: optional existing layer to use as ``k_fc``.
+            reused_value_fc: optional existing layer to use as ``v_fc``.
+            reused_proj_fc: optional existing layer to use as ``proj_fc``.
+        """
        super(MultiHeadAttention, self).__init__()
        self.n_head = n_head
        self.d_key = d_key
        self.d_value = d_value
        self.d_model = d_model
        self.dropout_rate = dropout_rate
-        self.q_fc = Linear(
-            input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
+        if reused_query_fc is not None:
-        self.k_fc = Linear(
+            self.q_fc = reused_query_fc
-            input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
+        else:
-        self.v_fc = Linear(
+            self.q_fc = Linear(
-            input_dim=d_model, output_dim=d_value * n_head, bias_attr=False)
+                input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
-        self.proj_fc = Linear(
+        if reused_key_fc is not None:
-            input_dim=d_value * n_head, output_dim=d_model, bias_attr=False)
+            self.k_fc = reused_key_fc
+        else:
+            self.k_fc = Linear(
+                input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
+        if reused_value_fc is not None:
+            self.v_fc = reused_value_fc
+        else:
+            self.v_fc = Linear(
+                input_dim=d_model,
+                output_dim=d_value * n_head,
+                bias_attr=False)
+        if reused_proj_fc is not None:
+            self.proj_fc = reused_proj_fc
+        else:
+            self.proj_fc = Linear(
+                input_dim=d_value * n_head,
+                output_dim=d_model,
+                bias_attr=False)
    def _prepare_qkv(self, queries, keys, values, cache=None):
        if keys is None:  # self-attention
@@ -828,12 +1249,24 @@ class FFN(Layer):
    Feed-Forward Network
    """
-    def __init__(self, d_inner_hid, d_model, dropout_rate):
+    def __init__(self,
+                 d_inner_hid,
+                 d_model,
+                 dropout_rate,
+                 fc1_act="relu",
+                 reused_fc1=None,
+                 reused_fc2=None):
+        """
+        Set up the two Linear layers of the feed-forward block.
+
+        ``fc1`` and ``fc2`` are taken from ``reused_fc1`` / ``reused_fc2``
+        when those are not None; otherwise new Linear layers are created.
+
+        Args:
+            d_inner_hid: output width of a newly created fc1 and input
+                width of a newly created fc2.
+            d_model: input width of a newly created fc1 and output width
+                of a newly created fc2.
+            dropout_rate: stored on the instance as ``self.dropout_rate``.
+            fc1_act: activation passed to fc1 when it is created here;
+                it has no effect when ``reused_fc1`` is supplied.
+            reused_fc1: optional existing layer to use as ``fc1``.
+            reused_fc2: optional existing layer to use as ``fc2``.
+        """
        super(FFN, self).__init__()
        self.dropout_rate = dropout_rate
-        self.fc1 = Linear(
+        if reused_fc1 is not None:
-            input_dim=d_model, output_dim=d_inner_hid, act="relu")
+            self.fc1 = reused_fc1
-        self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model)
+        else:
+            self.fc1 = Linear(
+                input_dim=d_model, output_dim=d_inner_hid, act=fc1_act)
+        if reused_fc2 is not None:
+            self.fc2 = reused_fc2
+        else:
+            self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model)
    def forward(self, x):
        hidden = self.fc1(x)
@@ -859,22 +1292,52 @@ class TransformerEncoderLayer(Layer):
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd="n",
-                 postprocess_cmd="da"):
+                 postprocess_cmd="da",
+                 ffn_fc1_act="relu",
+                 reused_pre_selatt_layernorm=None,
+                 reused_multihead_att_weights={
+                     "reused_query_fc": None,
+                     "reused_key_fc": None,
+                     "reused_value_fc": None,
+                     "reused_proj_fc": None
+                 },
+                 reused_post_selfatt_layernorm=None,
+                 reused_pre_ffn_layernorm=None,
+                 reused_ffn_weights={"reused_fc1": None,
+                                     "reused_fc2": None},
+                 reused_post_ffn_layernorm=None):
        super(TransformerEncoderLayer, self).__init__()
        self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout)
+                                                 prepostprocess_dropout,
-        self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
+                                                 reused_pre_selatt_layernorm)
-                                            attention_dropout)
+        self.self_attn = MultiHeadAttention(
-        self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
+            d_key,
-                                                  prepostprocess_dropout)
+            d_value,
+            d_model,
+            n_head,
+            attention_dropout,
+            reused_query_fc=reused_multihead_att_weights["reused_query_fc"],
+            reused_key_fc=reused_multihead_att_weights["reused_key_fc"],
+            reused_value_fc=reused_multihead_att_weights["reused_value_fc"],
+            reused_proj_fc=reused_multihead_att_weights["reused_proj_fc"])
+        self.postprocesser1 = PrePostProcessLayer(
+            postprocess_cmd, d_model, prepostprocess_dropout,
+            reused_post_selfatt_layernorm)
        self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout)
+                                                 prepostprocess_dropout,
-        self.ffn = FFN(d_inner_hid, d_model, relu_dropout)
+                                                 reused_pre_ffn_layernorm)
+        self.ffn = FFN(d_inner_hid,
+                       d_model,
+                       relu_dropout,
+                       fc1_act=ffn_fc1_act,
+                       reused_fc1=reused_ffn_weights["reused_fc1"],
+                       reused_fc2=reused_ffn_weights["reused_fc2"])
        self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
-                                                  prepostprocess_dropout)
+                                                  prepostprocess_dropout,
+                                                  reused_post_ffn_layernorm)
    def forward(self, enc_input, attn_bias):
        attn_output = self.self_attn(
@@ -902,7 +1365,8 @@ class TransformerEncoder(Layer):
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd="n",
-                 postprocess_cmd="da"):
+                 postprocess_cmd="da",
+                 ffn_fc1_act="relu"):
        super(TransformerEncoder, self).__init__()
@@ -912,9 +1376,17 @@ class TransformerEncoder(Layer):
                self.add_sublayer(
                    "layer_%d" % i,
                    TransformerEncoderLayer(
-                        n_head, d_key, d_value, d_model, d_inner_hid,
+                        n_head,
-                        prepostprocess_dropout, attention_dropout,
+                        d_key,
-                        relu_dropout, preprocess_cmd, postprocess_cmd)))
+                        d_value,
+                        d_model,
+                        d_inner_hid,
+                        prepostprocess_dropout,
+                        attention_dropout,
+                        relu_dropout,
+                        preprocess_cmd,
+                        postprocess_cmd,
+                        ffn_fc1_act=ffn_fc1_act)))
        self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
                                             prepostprocess_dropout)
@@ -941,28 +1413,79 @@ class TransformerDecoderLayer(Layer):
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd="n",
-                 postprocess_cmd="da"):
+                 postprocess_cmd="da",
+                 reused_pre_selfatt_layernorm=None,
+                 reused_self_multihead_att_weights={
+                     "reused_query_fc": None,
+                     "reused_key_fc": None,
+                     "reused_value_fc": None,
+                     "reused_proj_fc": None
+                 },
+                 reused_post_selfatt_layernorm=None,
+                 reused_pre_crossatt_layernorm=None,
+                 reused_cross_multihead_att_weights={
+                     "reused_query_fc": None,
+                     "reused_key_fc": None,
+                     "reused_value_fc": None,
+                     "reused_proj_fc": None
+                 },
+                 reused_post_crossatt_layernorm=None,
+                 reused_pre_ffn_layernorm=None,
+                 reused_ffn_weights={"reused_fc1": None,
+                                     "reused_fc2": None},
+                 reused_post_ffn_layernorm=None):
        super(TransformerDecoderLayer, self).__init__()
        self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout)
+                                                 prepostprocess_dropout,
-        self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
+                                                 reused_pre_selfatt_layernorm)
-                                            attention_dropout)
+        self.self_attn = MultiHeadAttention(
-        self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
+            d_key,
-                                                  prepostprocess_dropout)
+            d_value,
+            d_model,
+            n_head,
+            attention_dropout,
+            reused_query_fc=reused_self_multihead_att_weights[
+                "reused_query_fc"],
+            reused_key_fc=reused_self_multihead_att_weights["reused_key_fc"],
+            reused_value_fc=reused_self_multihead_att_weights[
+                "reused_value_fc"],
+            reused_proj_fc=reused_self_multihead_att_weights["reused_proj_fc"])
+        self.postprocesser1 = PrePostProcessLayer(
+            postprocess_cmd, d_model, prepostprocess_dropout,
+            reused_post_selfatt_layernorm)
        self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout)
+                                                 prepostprocess_dropout,
-        self.cross_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
+                                                 reused_pre_crossatt_layernorm)
-                                             attention_dropout)
+        self.cross_attn = MultiHeadAttention(
-        self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
+            d_key,
-                                                  prepostprocess_dropout)
+            d_value,
+            d_model,
+            n_head,
+            attention_dropout,
+            reused_query_fc=reused_cross_multihead_att_weights[
+                "reused_query_fc"],
+            reused_key_fc=reused_cross_multihead_att_weights["reused_key_fc"],
+            reused_value_fc=reused_cross_multihead_att_weights[
+                "reused_value_fc"],
+            reused_proj_fc=reused_cross_multihead_att_weights[
+                "reused_proj_fc"])
+        self.postprocesser2 = PrePostProcessLayer(
+            postprocess_cmd, d_model, prepostprocess_dropout,
+            reused_post_crossatt_layernorm)
        self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model,
-                                                 prepostprocess_dropout)
+                                                 prepostprocess_dropout,
-        self.ffn = FFN(d_inner_hid, d_model, relu_dropout)
+                                                 reused_pre_ffn_layernorm)
+        self.ffn = FFN(d_inner_hid,
+                       d_model,
+                       relu_dropout,
+                       reused_fc1=reused_ffn_weights["reused_fc1"],
+                       reused_fc2=reused_ffn_weights["reused_fc2"])
        self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model,
-                                                  prepostprocess_dropout)
+                                                  prepostprocess_dropout,
+                                                  reused_post_ffn_layernorm)
    def forward(self,
                dec_input,
@@ -1031,7 +1554,7 @@ class TransformerDecoder(Layer):
        ]
+#TODO: we should merge GRUCell with BasicGRUCell
 class GRUCell(RNNCell):
    def __init__(self,
                 input_size,
@@ -1044,9 +1567,7 @@ class GRUCell(RNNCell):
        super(GRUCell, self).__init__()
        self.hidden_size = hidden_size
        self.fc_layer = Linear(
-            input_size,
+            input_size, hidden_size * 3, param_attr=param_attr)
-            hidden_size * 3,
-            param_attr=param_attr)
        self.gru_unit = GRUUnit(
            hidden_size * 3,
@@ -1067,7 +1588,8 @@ class GRUCell(RNNCell):
        return [self.hidden_size]
-class GRUEncoderCell(RNNCell): 
+#TODO: we should merge GRUCell with BasicGRUCell
+class GRUEncoderCell(RNNCell):
    def __init__(self,
                 num_layers,
                 input_size,
@@ -1086,8 +1608,9 @@ class GRUEncoderCell(RNNCell):
                    GRUCell(
                        input_size=input_size if i == 0 else hidden_size,
                        hidden_size=hidden_size,
-                        param_attr=fluid.ParamAttr(initializer=fluid.initializer.UniformInitializer(
+                        param_attr=fluid.ParamAttr(
-                             low=-init_scale, high=init_scale)))))
+                            initializer=fluid.initializer.UniformInitializer(
+                                low=-init_scale, high=init_scale)))))
    def forward(self, step_input, states):
        new_states = []
@@ -1109,18 +1632,17 @@ class GRUEncoderCell(RNNCell):
 class BiGRU(fluid.dygraph.Layer):
    def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
+        """
+        Create one forward and one reverse GRU-based RNN.
+
+        ``self.gru`` and ``self.gru_r`` each wrap their own GRUEncoderCell
+        built from the same arguments (1, input_dim, grnn_hidden_dim, 0.0,
+        init_bound); they differ only in ``is_reverse``. Both use
+        ``time_major=False``.
+
+        Args:
+            input_dim: second positional argument of each GRUEncoderCell.
+            grnn_hidden_dim: third positional argument of each
+                GRUEncoderCell.
+            init_bound: fifth positional argument of each GRUEncoderCell.
+            h_0: accepted but not referenced in this constructor.
+
+        NOTE(review): the literal 0.0 passed as the fourth positional
+        argument is presumably a dropout probability -- confirm against
+        the GRUEncoderCell signature.
+        """
        super(BiGRU, self).__init__()
-        self.gru = RNN(GRUEncoderCell(1, input_dim,
+        self.gru = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0,
-                                   grnn_hidden_dim, 0.0, init_bound),
+                                      init_bound),
                       is_reverse=False,
                       time_major=False)
-        self.gru_r = RNN(GRUEncoderCell(1, input_dim,
+        self.gru_r = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0,
-                                   grnn_hidden_dim, 0.0, init_bound),
+                                        init_bound),
-                       is_reverse=True,
+                         is_reverse=True,
-                       time_major=False)
+                         time_major=False)
-    def forward(self, input_feature): 
+    def forward(self, input_feature):
        pre_gru, pre_state = self.gru(input_feature)
        gru_r, r_state = self.gru_r(input_feature)
        bi_merge = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1)
@@ -1320,14 +1842,14 @@ class SequenceTagging(fluid.dygraph.Layer):
        emission = self.fc(bigru_output)
-        if target is not None: 
+        if target is not None:
            crf_cost = self.linear_chain_crf(
                input=emission, label=target, length=lengths)
            avg_cost = fluid.layers.mean(x=crf_cost)
            self.crf_decoding.weight = self.linear_chain_crf.weight
            crf_decode = self.crf_decoding(input=emission, length=lengths)
            return crf_decode, avg_cost, lengths
-        else: 
+        else:
            self.linear_chain_crf.weight = self.crf_decoding.weight
            crf_decode = self.crf_decoding(input=emission, length=lengths)
            return crf_decode, lengths