Commit 601db3ab authored by xyzhou-puck

add bert, refine text.py

Parent d90f7cc6
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT fine-tuning in Paddle Dygraph Mode."""
import os
import io
import time
import argparse
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable
from hapi.text.text import PrePostProcessLayer
from hapi.text.bert.bert import BertConfig
from cls import ClsModelLayer
from hapi.text.bert.optimization import Optimizer
from hapi.text.bert.utils.args import ArgumentGroup, print_arguments, check_cuda
from hapi.model import set_device, Model, SoftmaxWithCrossEntropy, Input
from hapi.metrics import Accuracy
from hapi.text.bert.dataloader import SingleSentenceDataLoader, BertInputExample
import hapi.text.tokenizer.tokenization as tokenization
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("bert_config_path", str, "./config/bert_config.json", "Path to the json file for bert model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None,
"Init pre-training params which preforms fine-tuning from. If the "
"arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 100, "Number of epoches for training.")
train_g.add_arg("learning_rate", float, 0.0001, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay",
"scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1, "Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
train_g.add_arg("loss_scaling", float, 1.0,
"Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("data_dir", str, None, "Path to training data.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("max_seq_len", int, 512, "Tokens' number of the longest seqence allowed.")
data_g.add_arg("batch_size", int, 32,
"The total number of examples in one batch for training, see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
"If set, the batch size will be the maximum number of tokens in one batch. "
"Otherwise, it will be the maximum number of examples in one batch.")
data_g.add_arg("do_lower_case", bool, True,
"Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("random_seed", int, 5512, "Random seed.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("shuffle", bool, True, "")
run_type_g.add_arg("task_name", str, None,
"The name of task to perform fine-tuning, should be in {'xnli', 'mnli', 'cola', 'mrpc'}.")
run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
run_type_g.add_arg("do_test", bool, False, "Whether to perform evaluation on test data set.")
run_type_g.add_arg("use_data_parallel", bool, False, "The flag indicating whether to shuffle instances in each pass.")
run_type_g.add_arg("enable_ce", bool, False, help="The flag indicating whether to run the task for continuous evaluation.")
args = parser.parse_args()
def create_data(batch):
"""
convert data to variable
"""
src_ids = to_variable(batch[0], "src_ids")
position_ids = to_variable(batch[1], "position_ids")
sentence_ids = to_variable(batch[2], "sentence_ids")
input_mask = to_variable(batch[3], "input_mask")
labels = to_variable(batch[4], "labels")
labels.stop_gradient = True
return src_ids, position_ids, sentence_ids, input_mask, labels
def train(args):
device = set_device("gpu" if args.use_cuda else "cpu")
fluid.enable_dygraph(device)
bert_config = BertConfig(args.bert_config_path)
bert_config.print_config()
if not (args.do_train or args.do_test):
raise ValueError("For args `do_train`, `do_test`, at "
"least one of them must be True.")
trainer_count = fluid.dygraph.parallel.Env().nranks
tokenizer = tokenization.FullTokenizer(
vocab_file=args.vocab_path, do_lower_case=args.do_lower_case)
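    # Maps one raw MNLI TSV row to a BertInputExample: column 0 is the
    # example id, columns 8 and 9 are the premise/hypothesis texts, and the
    # last column is the gold label; the header row (line_id "0") is skipped.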
def mnli_line_processor(line_id, line):
if line_id == "0":
return None
uid = tokenization.convert_to_unicode(line[0])
text_a = tokenization.convert_to_unicode(line[8])
text_b = tokenization.convert_to_unicode(line[9])
label = tokenization.convert_to_unicode(line[-1])
if label not in ["contradiction", "entailment", "neutral"]:
label = "contradiction"
return BertInputExample(uid=uid, text_a=text_a, text_b=text_b, label=label)
bert_dataloader = SingleSentenceDataLoader("./data/glue_data/MNLI/train.tsv", tokenizer, ["contradiction", "entailment", "neutral"],
max_seq_length=64, batch_size=32, line_processor=mnli_line_processor)
num_train_examples = len(bert_dataloader.dataset)
max_train_steps = args.epoch * num_train_examples // args.batch_size // trainer_count
warmup_steps = int(max_train_steps * args.warmup_proportion)
print("Trainer count: %d" % trainer_count)
print("Num train examples: %d" % num_train_examples)
print("Max train steps: %d" % max_train_steps)
print("Num warmup steps: %d" % warmup_steps)
if args.use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
inputs = [Input([None, None], 'int64', name='src_ids'),
Input([None, None], 'int64', name='pos_ids'),
Input([None, None], 'int64', name='sent_ids'),
Input([None, None], 'float32', name='input_mask')]
labels = [Input([None, 1], 'int64', name='label')]
cls_model = ClsModelLayer(
args,
bert_config,
3,
is_training=True,
return_pooled_out=True)
optimizer = Optimizer(
warmup_steps=warmup_steps,
num_train_steps=max_train_steps,
learning_rate=args.learning_rate,
model_cls=cls_model,
weight_decay=args.weight_decay,
scheduler=args.lr_scheduler,
loss_scaling=args.loss_scaling,
parameter_list=cls_model.parameters())
cls_model.prepare(
optimizer,
SoftmaxWithCrossEntropy(),
Accuracy(topk=(1, 2)),
inputs,
labels,
device=device)
cls_model.bert_layer.init_parameters(args.init_pretraining_params, verbose=True)
cls_model.fit(train_data=bert_dataloader.dataloader, epochs=args.epoch)
return cls_model
if __name__ == '__main__':
print_arguments(args)
check_cuda(args.use_cuda)
if args.do_train:
cls_model = train(args)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"dygraph transformer layers"
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import json
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear, Layer
from hapi.text.bert.bert import BertEncoder
from hapi.model import Model
class ClsModelLayer(Model):
"""
classify model
"""
def __init__(self,
args,
config,
num_labels,
is_training=True,
return_pooled_out=True,
use_fp16=False):
super(ClsModelLayer, self).__init__()
self.config = config
self.is_training = is_training
self.use_fp16 = use_fp16
self.loss_scaling = args.loss_scaling
self.bert_layer = BertEncoder(
config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
self.cls_fc = Linear(
input_dim=self.config["hidden_size"],
output_dim=num_labels,
param_attr=fluid.ParamAttr(
name="cls_out_w",
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
        """
        Forward pass: encode with BERT, take the pooled sentence feature,
        apply dropout, and project it to classification logits. Loss and
        accuracy are computed outside this layer via `Model.prepare`.
        """
        enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
                                                     sentence_ids, input_mask)
        cls_feats = fluid.layers.dropout(
            x=next_sent_feat,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train")
        logits = self.cls_fc(cls_feats)
        return logits
#!/bin/bash
BERT_BASE_PATH="./data/pretrained_models/uncased_L-12_H-768_A-12/"
TASK_NAME='MNLI'
DATA_PATH="./data/glue_data/MNLI/"
CKPT_PATH="./data/saved_model/mnli_models"
export CUDA_VISIBLE_DEVICES=0
# start fine-tuning
python3.7 bert_classifier.py \
--task_name ${TASK_NAME} \
--use_cuda true \
--do_train true \
--do_test true \
--batch_size 64 \
--init_pretraining_params ${BERT_BASE_PATH}/dygraph_params/ \
--data_dir ${DATA_PATH} \
--vocab_path ${BERT_BASE_PATH}/vocab.txt \
--checkpoints ${CKPT_PATH} \
--save_steps 1000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--validation_steps 100 \
--epoch 3 \
--max_seq_len 128 \
--bert_config_path ${BERT_BASE_PATH}/bert_config.json \
--learning_rate 5e-5 \
--skip_steps 10 \
--shuffle true
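# Note: the script above runs on a single GPU. To pick a different device,
# change the exported variable, e.g. `export CUDA_VISIBLE_DEVICES=1`.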
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six
import copy
from hapi.progressbar import ProgressBar
from paddle.fluid.dygraph.parallel import ParallelEnv
def config_callbacks(callbacks=None,
model=None,
batch_size=None,
epochs=None,
steps=None,
log_freq=2,
verbose=2,
save_freq=1,
save_dir=None,
metrics=None,
mode='train'):
cbks = callbacks or []
cbks = cbks if isinstance(cbks, (list, tuple)) else [cbks]
if not any(isinstance(k, ProgBarLogger) for k in cbks) and verbose:
cbks = cbks + [ProgBarLogger(log_freq, verbose=verbose)]
if not any(isinstance(k, ModelCheckpoint) for k in cbks):
cbks = cbks + [ModelCheckpoint(save_freq, save_dir)]
cbk_list = CallbackList(cbks)
cbk_list.set_model(model)
    metrics = (metrics or []) if mode != 'test' else []
params = {
'batch_size': batch_size,
'epochs': epochs,
'steps': steps,
'verbose': verbose,
'metrics': metrics,
}
cbk_list.set_params(params)
return cbk_list
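# A minimal usage sketch (illustrative; `my_model` and the metric names are
# hypothetical): build the default callback list for a short training run.
#
#   cbks = config_callbacks(model=my_model, epochs=3, steps=1000,
#                           log_freq=10, save_dir='./ckpt',
#                           metrics=['loss', 'acc'])
#   cbks.on_begin('train')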
class CallbackList(object):
def __init__(self, callbacks=None):
        # copy so that appends do not mutate the caller's list
        self.callbacks = [c for c in callbacks]
self.params = {}
self.model = None
def append(self, callback):
self.callbacks.append(callback)
def __iter__(self):
return iter(self.callbacks)
def set_params(self, params):
for c in self.callbacks:
c.set_params(params)
def set_model(self, model):
for c in self.callbacks:
c.set_model(model)
def _call(self, name, *args):
for c in self.callbacks:
func = getattr(c, name)
func(*args)
def _check_mode(self, mode):
assert mode in ['train', 'eval', 'test'], \
'mode should be train, eval or test'
def on_begin(self, mode, logs=None):
self._check_mode(mode)
name = 'on_{}_begin'.format(mode)
self._call(name, logs)
def on_end(self, mode, logs=None):
self._check_mode(mode)
name = 'on_{}_end'.format(mode)
self._call(name, logs)
def on_epoch_begin(self, epoch=None, logs=None):
self._call('on_epoch_begin', epoch, logs)
def on_epoch_end(self, epoch=None, logs=None):
self._call('on_epoch_end', epoch, logs)
def on_batch_begin(self, mode, step=None, logs=None):
self._check_mode(mode)
name = 'on_{}_batch_begin'.format(mode)
self._call(name, step, logs)
def on_batch_end(self, mode, step=None, logs=None):
self._check_mode(mode)
name = 'on_{}_batch_end'.format(mode)
self._call(name, step, logs)
class Callback(object):
def __init__(self):
self.model = None
self.params = {}
def set_params(self, params):
self.params = params
def set_model(self, model):
self.model = model
def on_train_begin(self, logs=None):
"""
"""
def on_train_end(self, logs=None):
"""
"""
def on_eval_begin(self, logs=None):
"""
"""
def on_eval_end(self, logs=None):
"""
"""
def on_test_begin(self, logs=None):
"""
"""
def on_test_end(self, logs=None):
"""
"""
def on_epoch_begin(self, epoch, logs=None):
"""
"""
def on_epoch_end(self, epoch, logs=None):
"""
"""
def on_train_batch_begin(self, step, logs=None):
"""
"""
def on_train_batch_end(self, step, logs=None):
"""
"""
def on_eval_batch_begin(self, step, logs=None):
"""
"""
def on_eval_batch_end(self, step, logs=None):
"""
"""
    def on_test_batch_begin(self, step, logs=None):
        """
        """
    def on_test_batch_end(self, step, logs=None):
        """
        """
class ProgBarLogger(Callback):
def __init__(self, log_freq=1, verbose=2):
self.epochs = None
self.steps = None
self.progbar = None
self.verbose = verbose
self.log_freq = log_freq
def on_train_begin(self, logs=None):
self.epochs = self.params['epochs']
assert self.epochs
self.train_metrics = self.params['metrics']
assert self.train_metrics
def on_epoch_begin(self, epoch=None, logs=None):
self.steps = self.params['steps']
self.epoch = epoch
self.train_step = 0
if self.verbose and self.epochs and ParallelEnv().local_rank == 0:
print('Epoch %d/%d' % (epoch + 1, self.epochs))
self.train_progbar = ProgressBar(num=self.steps, verbose=self.verbose)
def _updates(self, logs, mode):
values = []
metrics = getattr(self, '%s_metrics' % (mode))
progbar = getattr(self, '%s_progbar' % (mode))
steps = getattr(self, '%s_step' % (mode))
for k in metrics:
if k in logs:
values.append((k, logs[k]))
progbar.update(steps, values)
def on_train_batch_end(self, step, logs=None):
logs = logs or {}
self.train_step += 1
        if (self.train_step % self.log_freq == 0 and self.verbose and
                ParallelEnv().local_rank == 0):
            # if steps is not None, the last step will be updated in on_epoch_end
            if self.steps and self.train_step < self.steps:
                self._updates(logs, 'train')
            else:
                self._updates(logs, 'train')
def on_epoch_end(self, epoch, logs=None):
logs = logs or {}
if self.verbose and ParallelEnv().local_rank == 0:
self._updates(logs, 'train')
    def on_eval_begin(self, logs=None):
        logs = logs or {}
        self.eval_steps = logs.get('steps', None)
self.eval_metrics = logs.get('metrics_name', [])
self.eval_step = 0
self.evaled_samples = 0
self.eval_progbar = ProgressBar(
num=self.eval_steps, verbose=self.verbose)
if ParallelEnv().local_rank == 0:
print('Eval begin...')
def on_eval_batch_end(self, step, logs=None):
logs = logs or {}
self.eval_step = step
samples = logs.get('batch_size', 1)
self.evaled_samples += samples
        if (self.eval_step % self.log_freq == 0 and self.verbose and
                ParallelEnv().local_rank == 0):
            # if steps is not None, the last step will be updated in on_eval_end
            if self.eval_steps and self.eval_step < self.eval_steps:
                self._updates(logs, 'eval')
def on_eval_end(self, logs=None):
logs = logs or {}
if self.verbose and ParallelEnv().local_rank == 0:
self._updates(logs, 'eval')
print('Eval samples: %d' % (self.evaled_samples))
class ModelCheckpoint(Callback):
def __init__(self, save_freq=1, save_dir=None):
self.save_freq = save_freq
self.save_dir = save_dir
def on_epoch_begin(self, epoch=None, logs=None):
self.epoch = epoch
def _is_save(self):
return self.model and self.save_dir and ParallelEnv().local_rank == 0
def on_epoch_end(self, epoch, logs=None):
if self._is_save() and self.epoch % self.save_freq == 0:
path = '{}/{}'.format(self.save_dir, epoch)
print('save checkpoint at {}'.format(path))
self.model.save(path)
def on_train_end(self, logs=None):
if self._is_save():
path = '{}/final'.format(self.save_dir)
print('save checkpoint at {}'.format(path))
self.model.save(path)
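# A minimal usage sketch (illustrative; `my_model` is hypothetical): passing a
# ModelCheckpoint in `callbacks` overrides the default saving behavior.
#
#   cbks = config_callbacks(callbacks=[ModelCheckpoint(save_freq=2,
#                                                      save_dir='./ckpt')],
#                           model=my_model, epochs=10)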
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import six
import time
import math
import socket
import contextlib
import numpy as np
from paddle import fluid
from paddle.fluid.layers import collective
from paddle.fluid.dygraph.parallel import ParallelEnv, ParallelStrategy
from paddle.fluid.io import BatchSampler
_parallel_context_initialized = False
class DistributedBatchSampler(BatchSampler):
"""Sampler that restricts data loading to a subset of the dataset.
In such case, each process can pass a DistributedBatchSampler instance
as a DataLoader sampler, and load a subset of the original dataset that
is exclusive to it.
.. note::
Dataset is assumed to be of constant size.
Args:
        data_source: a `fluid.io.Dataset` implementation or another python
                     object which implements `__len__`, for BatchSampler to
                     get the sample count of the data source.
        batch_size(int): the number of sample indices in a mini-batch.
        shuffle(bool): whether to shuffle the indices order before generating
            batch indices. Default False.
        drop_last(bool): whether to drop the last incomplete batch when the
            dataset size is not divisible by the batch size. Default False.
    A usage sketch follows this class definition.
    """
def __init__(self, dataset, batch_size, shuffle=False, drop_last=False):
self.dataset = dataset
assert isinstance(batch_size, int) and batch_size > 0, \
"batch_size should be a positive integer"
self.batch_size = batch_size
assert isinstance(shuffle, bool), \
"shuffle should be a boolean value"
self.shuffle = shuffle
assert isinstance(drop_last, bool), \
"drop_last should be a boolean number"
self.drop_last = drop_last
self.nranks = ParallelEnv().nranks
self.local_rank = ParallelEnv().local_rank
self.epoch = 0
self.num_samples = int(
math.ceil(len(self.dataset) * 1.0 / self.nranks))
self.total_size = self.num_samples * self.nranks
def __iter__(self):
num_samples = len(self.dataset)
indices = np.arange(num_samples).tolist()
indices += indices[:(self.total_size - len(indices))]
assert len(indices) == self.total_size
if self.shuffle:
np.random.RandomState(self.epoch).shuffle(indices)
self.epoch += 1
# subsample
def _get_indices_by_batch_size(indices):
subsampled_indices = []
last_batch_size = self.total_size % (self.batch_size * self.nranks)
assert last_batch_size % self.nranks == 0
last_local_batch_size = last_batch_size // self.nranks
for i in range(self.local_rank * self.batch_size,
len(indices) - last_batch_size,
self.batch_size * self.nranks):
subsampled_indices.extend(indices[i:i + self.batch_size])
indices = indices[len(indices) - last_batch_size:]
subsampled_indices.extend(indices[
self.local_rank * last_local_batch_size:(
self.local_rank + 1) * last_local_batch_size])
return subsampled_indices
if self.nranks > 1:
indices = _get_indices_by_batch_size(indices)
assert len(indices) == self.num_samples
_sample_iter = iter(indices)
batch_indices = []
for idx in _sample_iter:
batch_indices.append(idx)
if len(batch_indices) == self.batch_size:
yield batch_indices
batch_indices = []
if not self.drop_last and len(batch_indices) > 0:
yield batch_indices
def __len__(self):
num_samples = self.num_samples
num_samples += int(not self.drop_last) * (self.batch_size - 1)
return num_samples // self.batch_size
def set_epoch(self, epoch):
self.epoch = epoch
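# A minimal usage sketch (illustrative; `train_dataset` is a hypothetical
# fluid.io.Dataset instance):
#
#   sampler = DistributedBatchSampler(train_dataset, batch_size=32,
#                                     shuffle=True)
#   loader = fluid.io.DataLoader(train_dataset, batch_sampler=sampler,
#                                places=fluid.CPUPlace(), return_list=True)
#   # call sampler.set_epoch(epoch) at the start of each epoch so shuffling
#   # stays deterministic and in sync across ranks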
def _all_gather(x, nranks, ring_id=0, use_calc_stream=True):
return collective._c_allgather(
x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream)
def wait_server_ready(endpoints):
assert not isinstance(endpoints, six.string_types)
while True:
all_ok = True
not_ready_endpoints = []
for ep in endpoints:
ip_port = ep.split(":")
with contextlib.closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
sock.settimeout(2)
result = sock.connect_ex((ip_port[0], int(ip_port[1])))
if result != 0:
all_ok = False
not_ready_endpoints.append(ep)
if not all_ok:
time.sleep(3)
else:
break
def init_communicator(program, rank, nranks, wait_port, current_endpoint,
endpoints):
if nranks < 2:
return
other_endpoints = endpoints[:]
other_endpoints.remove(current_endpoint)
if rank == 0 and wait_port:
wait_server_ready(other_endpoints)
block = program.global_block()
nccl_id_var = block.create_var(
name=fluid.unique_name.generate('nccl_id'),
persistable=True,
type=fluid.core.VarDesc.VarType.RAW)
block.append_op(
type='c_gen_nccl_id',
inputs={},
outputs={'Out': nccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints
})
block.append_op(
type='c_comm_init',
inputs={'X': nccl_id_var},
outputs={},
attrs={
'nranks': nranks,
'rank': rank,
'ring_id': 0,
})
def prepare_distributed_context(place=None):
if place is None:
place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \
else fluid.CUDAPlace(0)
strategy = ParallelStrategy()
strategy.nranks = ParallelEnv().nranks
strategy.local_rank = ParallelEnv().local_rank
strategy.trainer_endpoints = ParallelEnv().trainer_endpoints
strategy.current_endpoint = ParallelEnv().current_endpoint
if strategy.nranks < 2:
return
global _parallel_context_initialized
if not _parallel_context_initialized and isinstance(place,
fluid.CUDAPlace):
def _init_context():
communicator_prog = fluid.Program()
init_communicator(communicator_prog, strategy.local_rank,
strategy.nranks, True, strategy.current_endpoint,
strategy.trainer_endpoints)
exe = fluid.Executor(place)
exe.run(communicator_prog)
if fluid.in_dygraph_mode():
fluid.disable_dygraph()
_init_context()
fluid.enable_dygraph(place)
else:
_init_context()
else:
assert ("Only support CUDAPlace for now.")
_parallel_context_initialized = True
return strategy
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
import six
import abc
import numpy as np
import paddle.fluid as fluid
import logging
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
__all__ = ['Metric', 'Accuracy']
@six.add_metaclass(abc.ABCMeta)
class Metric(object):
"""
Base class for metric, encapsulates metric logic and APIs
Usage:
m = SomeMetric()
for prediction, label in ...:
m.update(prediction, label)
m.accumulate()
"""
@abc.abstractmethod
def reset(self):
"""
Reset states and result
"""
raise NotImplementedError("function 'reset' not implemented in {}.".
format(self.__class__.__name__))
@abc.abstractmethod
def update(self, *args, **kwargs):
"""
Update states for metric
"""
raise NotImplementedError("function 'update' not implemented in {}.".
format(self.__class__.__name__))
@abc.abstractmethod
def accumulate(self):
"""
Accumulates statistics, computes and returns the metric value
"""
raise NotImplementedError(
"function 'accumulate' not implemented in {}.".format(
self.__class__.__name__))
@abc.abstractmethod
def name(self):
"""
Returns metric name
"""
raise NotImplementedError("function 'name' not implemented in {}.".
format(self.__class__.__name__))
def add_metric_op(self, pred, label):
"""
Add process op for metric in program
"""
return pred, label
class Accuracy(Metric):
"""
Encapsulates accuracy metric logic
"""
def __init__(self, topk=(1, ), name=None, *args, **kwargs):
super(Accuracy, self).__init__(*args, **kwargs)
self.topk = topk
self.maxk = max(topk)
self._init_name(name)
self.reset()
def add_metric_op(self, pred, label, *args, **kwargs):
pred = fluid.layers.argsort(pred[0], descending=True)[1][:, :self.maxk]
correct = pred == label[0]
return correct
def update(self, correct, *args, **kwargs):
accs = []
for i, k in enumerate(self.topk):
num_corrects = correct[:, :k].sum()
num_samples = len(correct)
accs.append(float(num_corrects) / num_samples)
self.total[i] += num_corrects
self.count[i] += num_samples
return accs
def reset(self):
self.total = [0.] * len(self.topk)
self.count = [0] * len(self.topk)
def accumulate(self):
res = []
for t, c in zip(self.total, self.count):
res.append(float(t) / c)
return res
def _init_name(self, name):
name = name or 'acc'
if self.maxk != 1:
self._name = ['{}_top{}'.format(name, k) for k in self.topk]
else:
            self._name = [name]
def name(self):
return self._name
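# A minimal usage sketch (illustrative; `logits` and `labels` are hypothetical
# dygraph tensors of shape [N, num_classes] and [N, 1]):
#
#   acc = Accuracy(topk=(1, 2))
#   correct = acc.add_metric_op([logits], [labels])  # bool matrix [N, maxk]
#   acc.update(correct.numpy())
#   top1, top2 = acc.accumulate()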
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
import inspect
import os
import pickle
import numpy as np
import six
import warnings
import tqdm
from collections import Iterable
from paddle import fluid
from paddle.fluid.framework import in_dygraph_mode, Variable
from paddle.fluid.executor import global_scope
from paddle.fluid.io import is_belong_to_optimizer
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.fluid.layers.utils import flatten
from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
from paddle.fluid.incubate.fleet.base import role_maker
from paddle.fluid.io import DataLoader, Dataset
from hapi.distributed import DistributedBatchSampler, _all_gather, prepare_distributed_context, _parallel_context_initialized
from hapi.metrics import Metric
from hapi.callbacks import config_callbacks
__all__ = [
'Model', 'Loss', 'CrossEntropy', 'Input', 'set_device',
'SoftmaxWithCrossEntropy'
]
def set_device(device):
assert isinstance(device, six.string_types) and device.lower() in ['cpu', 'gpu'], \
"Expected device in ['cpu', 'gpu'], but got {}".format(device)
place = fluid.CUDAPlace(ParallelEnv().dev_id) \
if device.lower() == 'gpu' and fluid.is_compiled_with_cuda() \
else fluid.CPUPlace()
return place
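# Example: `set_device('gpu')` returns a CUDAPlace when Paddle is compiled
# with CUDA, and falls back to CPUPlace otherwise.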
def to_list(value):
if value is None:
return value
if isinstance(value, (list, tuple)):
return value
return [value]
def to_numpy(var):
assert isinstance(var, (Variable, fluid.core.VarBase)), "not a variable"
if isinstance(var, fluid.core.VarBase):
return var.numpy()
t = global_scope().find_var(var.name).get_tensor()
return np.array(t)
def flatten_list(l):
assert isinstance(l, list), "not a list"
outl = []
splits = []
for sl in l:
assert isinstance(sl, list), "sub content not a list"
splits.append(len(sl))
outl += sl
return outl, splits
def restore_flatten_list(l, splits):
outl = []
for split in splits:
assert len(l) >= split, "list length invalid"
sl, l = l[:split], l[split:]
outl.append(sl)
return outl
def extract_args(func):
if hasattr(inspect, 'getfullargspec'):
return inspect.getfullargspec(func)[0]
else:
return inspect.getargspec(func)[0]
class Input(fluid.dygraph.Layer):
def __init__(self, shape=None, dtype=None, name=None):
super(Input, self).__init__()
self.shape = shape
self.dtype = dtype
self.name = name
def forward(self):
return fluid.data(self.name, shape=self.shape, dtype=self.dtype)
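# Example (mirrors the bert_classifier.py usage above): declare a
# variable-length int64 input for static-graph mode.
#
#   src_ids = Input([None, None], 'int64', name='src_ids')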
class Loss(object):
def __init__(self, average=True):
super(Loss, self).__init__()
self.average = average
def forward(self, outputs, labels):
raise NotImplementedError()
def __call__(self, outputs, labels):
labels = to_list(labels)
if in_dygraph_mode():
labels = [to_variable(l) for l in labels]
losses = to_list(self.forward(to_list(outputs), labels))
if self.average:
losses = [fluid.layers.reduce_mean(l) for l in losses]
else:
losses = [fluid.layers.reduce_sum(l) for l in losses]
return losses
class CrossEntropy(Loss):
def __init__(self, average=True):
        super(CrossEntropy, self).__init__(average)
def forward(self, outputs, labels):
return [
fluid.layers.cross_entropy(o, l) for o, l in zip(outputs, labels)
]
class SoftmaxWithCrossEntropy(Loss):
def __init__(self, average=True):
        super(SoftmaxWithCrossEntropy, self).__init__(average)
def forward(self, outputs, labels):
return [
fluid.layers.softmax_with_cross_entropy(
o, l, return_softmax=False) for o, l in zip(outputs, labels)
]
class StaticGraphAdapter(object):
def __init__(self, model):
super(StaticGraphAdapter, self).__init__()
self.model = model
# with `_build_once` gone, parameters are now created in `__init__`
# so we need to keep track of the parameters already created
self._startup_prog = fluid.default_startup_program()
self._orig_prog = fluid.default_main_program()
self._label_vars = {} # label variables
        self._input_vars = {}  # input variables
self._endpoints = {}
self._loss_endpoint = None
self._executor = None
self._progs = {}
self._compiled_progs = {}
self._merge_count = {
'eval_total': 0,
'test_total': 0,
'eval_batch': 0,
'test_batch': 0
}
self._nranks = ParallelEnv().nranks
self._local_rank = ParallelEnv().local_rank
@property
def mode(self):
return self.model.mode
@mode.setter
def mode(self, value):
self.model.mode = value
def train(self, inputs, labels=None):
assert self.model._optimizer, \
"model not ready, please call `model.prepare()` first"
self.mode = 'train'
return self._run(inputs, labels)
def eval(self, inputs, labels=None):
self.mode = 'eval'
return self._run(inputs, labels)
def test(self, inputs):
self.mode = 'test'
return self._run(inputs, None)
def parameters(self, *args, **kwargs):
return super(Model, self.model).parameters(*args, **kwargs)
def save(self, path):
def _save(state, path):
if not state:
return
state = {
k: to_numpy(v) if isinstance(v, Variable) else v
for k, v in state.items()
}
with open(path, 'wb') as f:
pickle.dump(state, f)
base = os.path.basename(path)
assert base != "", "path should be of 'dirname/filename' format"
dir_name = os.path.dirname(path)
if dir_name and not os.path.exists(dir_name):
os.makedirs(dir_name)
param_path = path + ".pdparams"
_save(self.model.state_dict(), param_path)
prog = self._progs.get('train', None)
if prog is None or self.model._optimizer is None:
return
# XXX `optimizer.state_dict()` only work in dygraph mode
optim_path = path + ".pdopt"
optim = {
p.name: p
for p in filter(is_belong_to_optimizer, prog.list_vars())
}
if not optim:
return
_save(optim, optim_path)
def load(self, param_state_pairs, optim_state):
if self._executor is None:
executor = fluid.Executor(fluid.CPUPlace())._default_executor
else:
executor = self._executor._default_executor
# restore parameter states
fluid.core._create_loaded_parameter(
[param for param, state in param_state_pairs],
global_scope(), executor)
for param, state in param_state_pairs:
self._set_var(param, state)
# restore optimizer states
# FIXME what if a different optimizer is used?
if not self.model._optimizer or not optim_state:
return
self._load_optimizer(optim_state, executor)
def _load_optimizer(self, state, executor):
prog = self._progs.get('train', None)
optim = list(filter(is_belong_to_optimizer, prog.list_vars()))
if not optim:
return
fluid.core._create_loaded_parameter(optim, global_scope(), executor)
converted_state = dict(state)
for var in optim:
if var.name in ["@LR_DECAY_COUNTER@", "global_step"]:
# When using learning rate scheduler, dygraph would name the
# global step var as "global_step" to save, while static-graph
# would has a state var named as "@LR_DECAY_COUNTER@".
# NOTE: dygraph saved global_step is 1 larger than that in
# static-graph, since the time of global_step to increase is
# different.
state_val = (
np.array(converted_state.pop("global_step")) - 1
) if "global_step" in converted_state else converted_state.pop(
"@LR_DECAY_COUNTER@", None)
if state_val is not None:
converted_state[var.name] = state_val
elif var.name.startswith("learning_rate_"):
# When using static learning rate, static-graph would make it
# a persistable var named 'unique_name.generate("learning_rate")',
# However, dygraph wouldn't save it.
if var.name not in state:
continue
else:
# moment and other accumulators
if var.name not in converted_state:
# try to convert from dygraph name
opt_name = self.model._optimizer._name
opt_cls_name = self.model._optimizer.__class__.__name__
opt_unq_name = None
for name in self.model._optimizer._accumulators.keys():
accum_name = name if opt_name is None else name[len(
opt_name) + 1:]
for param_name, state_var in self.model._optimizer._accumulators[
name].items():
if opt_unq_name is None:
# can not infer out the exact unique(opt_name),
# thus try to extract rather than generate
for state_key in sorted(
state.keys(),
key=lambda x: len(x),
reverse=True):
prefix = param_name + "_" + (
opt_cls_name if opt_name is None else
opt_name) + "_"
if state_key.startswith(prefix):
prefix_offset = state_key[len(
prefix):].find("_") + len(prefix)
opt_unq_name = state_key[len(
param_name + "_"):prefix_offset]
# TODO: assert
# assert opt_unq_name is None
# gen(param.name + "_" + gen(opt_name) + "_" + accum_name)
# always end with "_0" since the unique optimizer._name
dy_state_name = (param_name + "_" + opt_unq_name +
"_" + accum_name + "_0")
converted_state[
state_var.name] = converted_state.pop(
dy_state_name)
assert var.name in converted_state, \
"variable [{}] is not in optimizer state file".format(var.name)
self._set_var(var, converted_state[var.name])
def _set_var(self, var, ndarray):
t = global_scope().find_var(var.name).get_tensor()
p = t._place()
if p.is_cpu_place():
place = fluid.CPUPlace()
elif p.is_cuda_pinned_place():
place = fluid.CUDAPinnedPlace()
else:
p = fluid.core.Place()
p.set_place(t._place())
place = fluid.CUDAPlace(p.gpu_device_id())
t.set(ndarray, place)
def _run(self, inputs, labels=None):
compiled_prog = self._compiled_progs.get(self.mode, None)
assert compiled_prog, \
"Model is not ready, please call `model.prepare()` first"
inputs = to_list(inputs)
if labels is not None:
labels = to_list(labels)
assert len(inputs) == len(self._input_vars[self.mode]), \
"number of inputs" \
+ " does not match number of arguments of `forward` method"
feed = {}
input_names = [v.name for v in self._input_vars[self.mode]]
for idx, n in enumerate(input_names):
# train and test may take different arguments
if inputs[idx] is not None:
feed[n] = inputs[idx]
if labels is not None:
for idx, v in enumerate(self._label_vars[self.mode]):
feed[v.name] = labels[idx]
endpoints = self._endpoints[self.mode]
if self.mode == 'test':
fetch_list = endpoints['output']
else:
metric_list, metric_splits = flatten_list(endpoints['metric'])
fetch_list = endpoints['loss'] + metric_list
num_loss = len(endpoints['loss'])
rets = self._executor.run(compiled_prog,
feed=feed,
fetch_list=fetch_list,
return_numpy=False)
# LoDTensor cannot be fetch as numpy directly
rets = [np.array(v) for v in rets]
if self.mode == 'test':
return rets[:]
losses = rets[:num_loss]
metric_states = restore_flatten_list(rets[num_loss:], metric_splits)
metrics = []
for metric, state in zip(self.model._metrics, metric_states):
# cut off padding size
if self.mode != 'train' and self.model._test_dataloader is not None \
and isinstance(self.model._test_dataloader, DataLoader) \
and self._nranks > 1:
total_size = len(self.model._test_dataloader.dataset)
# TODO: fixme if have better way to get batch size
samples = state[0].shape[0]
current_count = self._merge_count.get(self.mode + '_total', 0)
if current_count + samples >= total_size:
state = [
s[:total_size - current_count, ...] for s in state
]
self._merge_count[self.mode + '_total'] = 0
self._merge_count[self.mode +
'_batch'] = total_size - current_count
else:
self._merge_count[self.mode + '_total'] += samples
self._merge_count[self.mode + '_batch'] = samples
metrics.append(metric.update(*state))
return (losses, metrics) if len(metrics) > 0 else losses
def prepare(self):
modes = ['train', 'eval', 'test']
for mode in modes:
self._make_program(mode)
self._compile_and_initialize(self._progs[mode], mode)
def _make_program(self, mode):
prog = self._progs.get(mode, None)
if prog is not None:
return
prog = self._orig_prog.clone()
# NOTE: When defining learning rate scheduling in static-graph, ops to
# increase the global step var and calculate learning rate would be
        # prepended into _orig_prog. The test program made by `_orig_prog.clone`
        # would also include these ops. Thus these ops must be pruned in the
        # test program, otherwise the global step would be changed during test.
if mode != 'train':
for op in list(prog.global_block().ops):
prog.global_block()._remove_op(0)
if mode == 'train' and self.model._optimizer \
and self.model._optimizer._learning_rate_map:
# HACK workaround learning rate map issue
lr_var = self.model._optimizer._learning_rate_map[self._orig_prog]
new_lr_var = prog.global_block().vars[lr_var.name]
self.model._optimizer._learning_rate_map[prog] = new_lr_var
losses = []
metrics = []
with fluid.program_guard(prog, self._startup_prog):
ins = self.model._inputs
lbls = self.model._labels if self.model._labels else []
inputs = [k.forward() for k in to_list(ins)]
labels = [k.forward() for k in to_list(lbls)]
self._label_vars[mode] = labels
outputs = to_list(self.model.forward(*inputs))
if mode != 'test' and self.model._loss_function:
losses = self.model._loss_function(outputs, labels)
if self._nranks > 1 and mode != 'train':
outputs = [_all_gather(o, self._nranks) for o in outputs]
if mode != 'test':
labels = [_all_gather(l, self._nranks) for l in labels]
if mode != 'test':
for metric in self.model._metrics:
metrics.append(
to_list(metric.add_metric_op(outputs, labels)))
if mode == 'train' and self.model._optimizer:
self._loss_endpoint = fluid.layers.sum(losses)
if self._nranks > 1:
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
dist_strategy = DistributedStrategy()
dist_strategy.mode = "collective"
dist_strategy.collective_mode = "grad_allreduce"
self.model._optimizer = fleet.distributed_optimizer(
self.model._optimizer, strategy=dist_strategy)
self.model._optimizer.minimize(self._loss_endpoint)
if mode != 'train': # clone again to put it in test mode
prog = prog.clone(for_test=True)
self._input_vars[mode] = inputs
self._progs[mode] = prog
self._endpoints[mode] = {
"output": outputs,
"loss": losses,
"metric": metrics
}
def _compile_and_initialize(self, prog, mode):
compiled_prog = self._compiled_progs.get(mode, None)
if compiled_prog is not None:
return compiled_prog
assert self.model._place is not None, \
"device is not set, please call `model.prepare()` first"
place = self.model._place
# XXX *ALL WEIGHTS* should be initialized upon model construction
# even if `forward()` may run different code path for different mode
# therefore startup program only needs to run once
if self._executor is None:
self._executor = fluid.Executor(place)
# XXX incremental initialization
uninitialized = []
for var_py in self._startup_prog.list_vars():
var = fluid.global_scope().find_var(var_py.name)
if not var_py.name.startswith('nccl_id') and var and \
var.get_tensor()._is_initialized():
continue
uninitialized.append(var_py)
if uninitialized:
startup_prog = self._startup_prog._prune(uninitialized)
self._executor.run(startup_prog)
if self._nranks < 2:
compiled_prog = fluid.CompiledProgram(prog)
else:
compiled_prog = prog
self._compiled_progs[mode] = compiled_prog
class DynamicGraphAdapter(object):
def __init__(self, model):
super(DynamicGraphAdapter, self).__init__()
self.model = model
self._nranks = ParallelEnv().nranks
self._local_rank = ParallelEnv().local_rank
self._merge_count = {
'eval_total': 0,
'test_total': 0,
'eval_batch': 0,
'test_batch': 0
}
if self._nranks > 1:
            strategy = fluid.dygraph.parallel.ParallelStrategy()
            strategy.nranks = ParallelEnv().nranks
            strategy.local_rank = ParallelEnv().local_rank
            strategy.trainer_endpoints = ParallelEnv().trainer_endpoints
            strategy.current_endpoint = ParallelEnv().current_endpoint
            self.ddp_model = fluid.dygraph.parallel.DataParallel(self.model,
                                                                 strategy)
@property
def mode(self):
return self.model.mode
@mode.setter
def mode(self, value):
self.model.mode = value
    # TODO: multi-device in dygraph mode is not implemented yet
def train(self, inputs, labels=None):
assert self.model._optimizer, \
"model not ready, please call `model.prepare()` first"
super(Model, self.model).train()
self.mode = 'train'
inputs = to_list(inputs)
if labels is not None:
labels = [to_variable(l) for l in to_list(labels)]
if self._nranks > 1:
outputs = self.ddp_model.forward(*[to_variable(x) for x in inputs])
losses = self.model._loss_function(outputs, labels)
final_loss = fluid.layers.sum(losses)
final_loss = self.ddp_model.scale_loss(final_loss)
final_loss.backward()
self.ddp_model.apply_collective_grads()
else:
outputs = self.model.forward(*[to_variable(x) for x in inputs])
losses = self.model._loss_function(outputs, labels)
final_loss = fluid.layers.sum(losses)
final_loss.backward()
self.model._optimizer.minimize(final_loss)
self.model.clear_gradients()
metrics = []
for metric in self.model._metrics:
metric_outs = metric.add_metric_op(
to_list(outputs), to_list(labels))
m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
metrics.append(m)
return ([to_numpy(l) for l in losses], metrics) \
if len(metrics) > 0 else [to_numpy(l) for l in losses]
def eval(self, inputs, labels=None):
super(Model, self.model).eval()
self.mode = 'eval'
inputs = to_list(inputs)
if labels is not None:
labels = [to_variable(l) for l in to_list(labels)]
outputs = self.model.forward(*[to_variable(x) for x in inputs])
if self.model._loss_function:
losses = self.model._loss_function(outputs, labels)
else:
losses = []
if self._nranks > 1:
outputs = [_all_gather(o, self._nranks) for o in to_list(outputs)]
labels = [_all_gather(l, self._nranks) for l in labels]
metrics = []
for metric in self.model._metrics:
# cut off padding value.
if self.model._test_dataloader is not None and self._nranks > 1 \
and isinstance(self.model._test_dataloader, DataLoader):
total_size = len(self.model._test_dataloader.dataset)
samples = outputs[0].shape[0]
current_count = self._merge_count.get(self.mode + '_total', 0)
if current_count + samples >= total_size:
outputs = [o[:total_size - current_count] for o in outputs]
labels = [l[:total_size - current_count] for l in labels]
self._merge_count[self.mode + '_total'] = 0
self._merge_count[self.mode +
'_batch'] = total_size - current_count
else:
self._merge_count[self.mode + '_total'] += samples
self._merge_count[self.mode + '_batch'] = samples
metric_outs = metric.add_metric_op(to_list(outputs), labels)
m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
metrics.append(m)
# To be consistent with static graph
# return empty loss if loss_function is None
return ([to_numpy(l) for l in losses], metrics) \
if len(metrics) > 0 else [to_numpy(l) for l in losses]
def test(self, inputs):
super(Model, self.model).eval()
self.mode = 'test'
inputs = [to_variable(x) for x in to_list(inputs)]
outputs = self.model.forward(*inputs)
if self._nranks > 1 and isinstance(self.model._place, fluid.CUDAPlace):
outputs = [_all_gather(o, self._nranks) for o in to_list(outputs)]
return [to_numpy(o) for o in to_list(outputs)]
def parameters(self, *args, **kwargs):
return super(Model, self.model).parameters(*args, **kwargs)
def save(self, path):
params = self.model.state_dict()
fluid.save_dygraph(params, path)
if self.model._optimizer is None:
return
if self.model._optimizer.state_dict():
optim = self.model._optimizer.state_dict()
fluid.save_dygraph(optim, path)
def load(self, param_state_pairs, optim_state):
# restore parameter states
for param, state in param_state_pairs:
param.set_value(state)
        # restore optimizer states
if not self.model._optimizer or not optim_state:
return
# If optimizer performs set_dict when state vars haven't been created,
# which would happen when set_dict before minimize, the state would be
# stored in optimizer._accumulators_holder and loaded lazily.
        # To handle this when loading from static-graph saved states, extend
        # the state dict to include keys named according to dygraph naming rules.
# TODO: if len(self.model._optimizer._accumulators) > 0
converted_state = dict(optim_state)
opt_unq_name = self.model._optimizer._name
opt_cls_name = self.model._optimizer.__class__.__name__
opt_name = opt_unq_name[:opt_unq_name.rfind("_")] # remove suffix idx
param_names = [param.name for param in self.model.parameters()]
for var_name, state_var in sorted(
optim_state.items(), key=lambda x: len(x[0]), reverse=True):
if var_name in ["@LR_DECAY_COUNTER@", "global_step"]:
# NOTE: dygraph saved global_step is 1 larger than that in
# static-graph, since the time of global_step to increase is
# different.
if var_name == "@LR_DECAY_COUNTER@":
converted_state["global_step"] = np.array(
converted_state.pop("@LR_DECAY_COUNTER@")) + 1
else:
# moment and other accumulators
# extend state dict to include promising dygraph names
for param_name in param_names:
if var_name.startswith(param_name + "_" + opt_name):
# when init optimizer with name
accum_name = var_name[len(param_name + "_" + opt_name +
"_"):]
elif var_name.startswith(param_name +
"_") and opt_name == opt_cls_name:
# when init optimizer without name
accum_name = var_name[len(param_name + "_"):]
else:
continue
# remove suffix idx
accum_name = accum_name[:accum_name.rfind("_")]
# state names always end with "_0" in dygraph because of the
# unique optimizer._name
dy_state_name = (param_name + "_" + opt_unq_name + "_" +
accum_name + "_0")
converted_state[dy_state_name] = state_var
self.model._optimizer.set_dict(converted_state)
class Model(fluid.dygraph.Layer):
"""
FIXME: add more comments and usage
"""
def __init__(self):
super(Model, self).__init__(self.__class__.__name__)
self.mode = 'train'
self._inputs = None
self._labels = None
self._loss_function = None
self._loss_weights = None
        self._optimizer = None
        self._device = None
self._test_dataloader = None
# init backend
if fluid.in_dygraph_mode():
self._adapter = DynamicGraphAdapter(self)
else:
self._adapter = StaticGraphAdapter(self)
def train(self, *args, **kwargs):
return self._adapter.train(*args, **kwargs)
def eval(self, *args, **kwargs):
return self._adapter.eval(*args, **kwargs)
def test(self, *args, **kwargs):
return self._adapter.test(*args, **kwargs)
def save(self, *args, **kwargs):
if ParallelEnv().local_rank == 0:
return self._adapter.save(*args, **kwargs)
def load(self, path, skip_mismatch=False, reset_optimizer=False):
"""
        Load from files storing the model states and optimizer states. The file
        for optimizer states is not necessary if there is no need to restore
        the optimizer.
        NOTE: parameters are retrieved from the file storing model states
        according to their structured names.
        For fine-tuning or transfer-learning models where some of the layers
        have changed, keep the parameters needed to restore under the same
        structured names in both the pre-trained and the fine-tuning model.
Args:
path (str): The prefix of files storing the model states and
optimizer states. The files would be `path.pdparams` and
`path.pdopt` separately, and the latter is not necessary
when no need to restore.
            skip_mismatch (bool): Whether to skip loading mismatched
                parameters or raise an error when a mismatch happens (the
                parameter is not found in the file storing model states, or
                has a mismatched shape).
            reset_optimizer (bool): If True, ignore the provided file storing
                optimizer states and initialize optimizer states from scratch.
                Otherwise, restore optimizer states from `path.pdopt` if
                an optimizer has been set to the model. Default False.
"""
def _load_state_from_path(path):
if not os.path.exists(path):
return
with open(path, 'rb') as f:
return pickle.load(f) if six.PY2 else pickle.load(
f, encoding='latin1')
def _check_match(key, param):
state = param_state.get(key, None)
if state is None:
raise ValueError(
"{} is not found in the providing file.".format(key))
if list(state.shape) != list(param.shape):
raise ValueError(
"{} receives a shape {}, but the expected shape is {}.".
format(key, list(state.shape), list(param.shape)))
return param, state
param_state = _load_state_from_path(path + ".pdparams")
assert param_state, "Failed to load parameters, please check path."
matched_param_state = []
for key, param in self.state_dict().items():
try:
match_res = _check_match(key, param)
except ValueError as err:
if skip_mismatch:
                    warnings.warn(
                        "Skip loading for {}. ".format(key) + str(err))
# reset optimizer when mismatch happens
reset_optimizer = True
else:
raise err
matched_param_state.append(match_res)
optim_state = None if reset_optimizer else _load_state_from_path(
path + ".pdopt")
return self._adapter.load(matched_param_state, optim_state)
def parameters(self, *args, **kwargs):
return self._adapter.parameters(*args, **kwargs)
def prepare(self,
optimizer=None,
loss_function=None,
metrics=None,
inputs=None,
labels=None,
device=None):
"""
FIXME: add comments
Args:
            optimizer (Optimizer|None): optimizer must be set in training
                and should be an Optimizer instance. It can be None in eval
                and test mode.
            loss_function (Loss|None): loss function must be set in training
                and should be a Loss instance. It can be None when there is
                no loss.
            metrics (Metric|list of Metric|None): if metrics is set, all
                metrics will be calculated and output in train/eval mode.
            inputs (Input|list|dict|None): inputs, the entry points of the
                network, could be an Input layer, a list of Input layers,
                a dict (name: Input), or None. For static graph,
                inputs must be set. For dynamic graph, it could be None.
            labels (Input|list|None): labels, the entry points of the network,
                could be an Input layer, a list of Input layers, or None.
                For static graph, if loss_function is set in Model.prepare(),
                labels must be set too. Otherwise, it could be None.
            device (str|None): specify device type, 'CPU' or 'GPU'.
                If None, automatically select the device according to the
                installed package version.
"""
if isinstance(device, fluid.CUDAPlace) or \
(isinstance(device, six.string_types) and device.lower() == 'gpu') \
or (device is None and fluid.is_compiled_with_cuda()):
if isinstance(device, fluid.CUDAPlace):
self._place = device
else:
self._place = fluid.CUDAPlace(ParallelEnv().dev_id) \
if ParallelEnv().nranks > 1 else fluid.CUDAPlace(0)
global _parallel_context_initialized
if ParallelEnv().nranks > 1 and not _parallel_context_initialized:
if fluid.in_dygraph_mode():
fluid.disable_dygraph()
fluid.enable_dygraph(self._place)
fluid.dygraph.parallel.prepare_context()
else:
prepare_distributed_context(self._place)
_parallel_context_initialized = True
elif isinstance(device, fluid.CPUPlace):
self._place = device
elif (isinstance(device, six.string_types) and device.lower() == 'cpu') \
or (device is None):
self._place = fluid.CPUPlace()
else:
            raise ValueError(
                "Expected device in ('gpu', 'cpu', fluid.CUDAPlace, "
                "fluid.CPUPlace, None), but got {}".format(device))
self._optimizer = optimizer
if loss_function:
if not isinstance(loss_function, Loss):
raise TypeError(
"'loss_function' must be sub classes of 'Loss'")
self._loss_function = loss_function
if not in_dygraph_mode():
            if not isinstance(inputs, (list, dict, Input)):
                raise TypeError(
                    "'inputs' must be a list, dict, or Input in static graph mode")
            if loss_function and not isinstance(labels, (list, Input)):
                raise TypeError(
                    "'labels' must be a list or Input in static graph mode")
metrics = metrics or []
for metric in to_list(metrics):
assert isinstance(metric, Metric), \
"{} is not sub class of Metric".format(
metric.__class__.__name__)
self._metrics = to_list(metrics)
self._inputs = to_list(inputs) if not isinstance(inputs, dict) else [
inputs[n] for n in extract_args(self.forward) if n != 'self'
]
self._labels = to_list(labels)
if not in_dygraph_mode():
self._adapter.prepare()
def fit(
self,
train_data=None,
eval_data=None,
batch_size=1,
epochs=1,
eval_freq=1,
log_freq=10,
save_dir=None,
save_freq=1,
verbose=2,
drop_last=False,
shuffle=True,
num_workers=0,
callbacks=None, ):
"""
FIXME: add more comments and usage
Args:
            train_data (Dataset|DataLoader): An iterable data loader used for
                training. An instance of paddle.fluid.io.Dataset or
                paddle.fluid.io.DataLoader is recommended.
            eval_data (Dataset|DataLoader): An iterable data loader used for
                evaluation at the end of each epoch. If None, no evaluation is
                performed. An instance of paddle.fluid.io.Dataset or
                paddle.fluid.io.DataLoader is recommended.
            batch_size (int): The batch size of train_data and eval_data. When
                train_data and eval_data are both instances of DataLoader, this
                parameter is ignored.
            epochs (int): The number of epochs to train the model.
            eval_freq (int): The frequency, in number of epochs, at which an
                evaluation is performed.
            log_freq (int): The frequency, in number of steps, at which the
                training logs are printed.
save_dir(str|None): The directory to save checkpoint during training.
If None, will not save checkpoint.
save_freq (int): The frequency, in number of epochs, to save checkpoint.
verbose (int): The verbosity mode, should be 0, 1, or 2.
0 = silent, 1 = progress bar, 2 = one line per epoch.
            drop_last (bool): whether to drop the last incomplete batch of
                train_data when the dataset size is not divisible by the batch
                size. When train_data is an instance of DataLoader, this
                parameter is ignored.
            shuffle (bool): whether to shuffle train_data. When train_data is
                an instance of DataLoader, this parameter is ignored.
            num_workers (int): the number of subprocesses used to load data; 0
                means no subprocess is used and data is loaded in the main
                process. When train_data and eval_data are both instances of
                DataLoader, this parameter is ignored.
callbacks (Callback|None): A list of `Callback` instances to apply
during training. If None, `ProgBarLogger` and `ModelCheckpoint`
are automatically inserted.
"""
assert train_data is not None, \
"train_data must be given!"
if fluid.in_dygraph_mode():
feed_list = None
else:
feed_list = [x.forward() for x in self._inputs + self._labels]
if isinstance(train_data, Dataset):
train_sampler = DistributedBatchSampler(
train_data,
batch_size=batch_size,
shuffle=shuffle,
drop_last=drop_last)
train_loader = DataLoader(
train_data,
batch_sampler=train_sampler,
places=self._place,
feed_list=feed_list,
num_workers=num_workers,
return_list=True)
else:
train_loader = train_data
if eval_data is not None and isinstance(eval_data, Dataset):
eval_sampler = DistributedBatchSampler(
eval_data, batch_size=batch_size)
eval_loader = DataLoader(
eval_data,
batch_sampler=eval_sampler,
places=self._place,
feed_list=feed_list,
num_workers=num_workers,
return_list=True)
elif eval_data is not None:
eval_loader = eval_data
else:
eval_loader = None
do_eval = eval_loader is not None
self._test_dataloader = eval_loader
metrics_name = self._metrics_name()
steps = len(train_loader) if hasattr(train_loader, '__len__') else None
cbks = config_callbacks(
callbacks,
model=self,
epochs=epochs,
steps=steps,
log_freq=log_freq,
save_freq=save_freq,
save_dir=save_dir,
verbose=verbose,
metrics=self._metrics_name(), )
cbks.on_begin('train')
for epoch in range(epochs):
# FIXME: adapt to DataLoader
loader = train_loader
if not isinstance(train_loader, Iterable):
loader = train_loader()
logs = self._run_one_epoch(
loader, cbks, 'train', metrics_name, epoch=epoch)
if do_eval and epoch % eval_freq == 0:
# FIXME: adapt to DataLoader
loader = eval_loader
if not isinstance(eval_loader, Iterable):
loader = eval_loader()
eval_steps = len(loader) if hasattr(loader,
'__len__') else None
cbks.on_begin('eval', {
'steps': eval_steps,
'metrics_name': metrics_name
})
logs = self._run_one_epoch(loader, cbks, 'eval', metrics_name)
cbks.on_end('eval', logs)
cbks.on_end('train', logs)
self._test_dataloader = None
def evaluate(
self,
eval_data,
batch_size=1,
log_freq=10,
verbose=2,
num_workers=0,
callbacks=None, ):
"""
FIXME: add more comments and usage
Args:
eval_data (Dataset|DataLoader): An iterable data loader is used for
evaluation. An instance of paddle.fluid.io.Dataset or
paddle.fluid.io.Dataloader is recomended.
batch_size (int): Integer number. The batch size of train_data and eval_data.
When train_data and eval_data are both the instance of Dataloader, this
parameter will be ignored.
log_freq (int): The frequency, in number of steps, the eval logs
are printed.
verbose (int): The verbosity mode, should be 0, 1, or 2.
0 = silent, 1 = progress bar, 2 = one line per epoch.
num_workers (int): The number of subprocess to load data, 0 for no subprocess
used and loading data in main process. When train_data and eval_data are
both the instance of Dataloader, this parameter will be ignored.
callbacks (Callback|None): A list of `Callback` instances to apply
during training. If None, `ProgBarLogger` and `ModelCheckpoint`
are automatically inserted.
"""
if fluid.in_dygraph_mode():
feed_list = None
else:
feed_list = [x.forward() for x in self._inputs + self._labels]
if eval_data is not None and isinstance(eval_data, Dataset):
eval_sampler = DistributedBatchSampler(
eval_data, batch_size=batch_size)
eval_loader = DataLoader(
eval_data,
batch_sampler=eval_sampler,
places=self._place,
feed_list=feed_list,
num_workers=num_workers,
return_list=True)
else:
eval_loader = eval_data
self._test_dataloader = eval_loader
metrics_name = self._metrics_name()
cbks = config_callbacks(
callbacks,
model=self,
log_freq=log_freq,
verbose=verbose,
metrics=self._metrics_name(), )
loader = eval_loader
if not isinstance(eval_loader, Iterable):
loader = eval_loader()
eval_steps = len(loader) if hasattr(loader, '__len__') else None
cbks.on_begin('eval',
{'steps': eval_steps,
'metrics_name': metrics_name})
logs = self._run_one_epoch(loader, cbks, 'eval', metrics_name)
cbks.on_end('eval', logs)
self._test_dataloader = None
eval_result = {}
for k in self._metrics_name():
eval_result[k] = logs[k]
return eval_result
def predict(self, test_data, batch_size=1, num_workers=0):
"""
FIXME: add more comments and usage
Args:
test_data (Dataset|DataLoader): An iterable data loader is used for
predict. An instance of paddle.fluid.io.Dataset or paddle.fluid.io.Dataloader
is recomended.
batch_size (int): Integer number. The batch size of train_data and eval_data.
When train_data and eval_data are both the instance of Dataloader, this
parameter will be ignored.
num_workers (int): the number of subprocess to load data, 0 for no subprocess
used and loading data in main process. When train_data and eval_data are
both the instance of Dataloader, this parameter will be ignored.
"""
if fluid.in_dygraph_mode():
feed_list = None
else:
feed_list = [x.forward() for x in self._inputs + self._labels]
if test_data is not None and isinstance(test_data, Dataset):
test_sampler = DistributedBatchSampler(
test_data, batch_size=batch_size)
test_loader = DataLoader(
test_data,
batch_sampler=test_sampler,
places=self._place,
feed_list=feed_list,
num_workers=num_workers,
return_list=True)
else:
test_loader = test_data
self._test_dataloader = test_loader
loader = test_loader
if not isinstance(test_loader, Iterable):
loader = test_loader()
outputs = None
for data in tqdm.tqdm(loader):
if not fluid.in_dygraph_mode():
data = data[0]
outs = self.test(*data)
if outputs is None:
outputs = outs
else:
outputs = [
np.vstack([x, outs[i]]) for i, x in enumerate(outputs)
]
self._test_dataloader = None
if test_loader is not None and self._adapter._nranks > 1 \
and isinstance(test_loader, DataLoader):
outputs = [o[:len(test_loader.dataset)] for o in outputs]
return outputs
def set_eval_data(self, eval_data):
"""
Args:
eval_data (Dataset|DataLoader|None): An iterable data loader is used for
eval. An instance of paddle.fluid.io.Dataset or
paddle.fluid.io.Dataloader is recomended.
"""
assert isinstance(
eval_data,
DataLoader), "eval_data must be a instance of Dataloader!"
self._test_dataloader = eval_data
def _run_one_epoch(self,
data_loader,
callbacks,
mode,
metrics_name,
epoch=None):
size = len(data_loader) if hasattr(data_loader, '__len__') else None
logs = {
'steps': size,
'metrics_name': metrics_name,
}
if mode == 'train':
assert epoch is not None, 'when mode is train, epoch must be given'
callbacks.on_epoch_begin(epoch)
for step, data in enumerate(data_loader):
            # data might come from different types of data_loader and have
            # different formats, as follows:
            # 1. DataLoader in static graph:
            #    [[input1, input2, ..., label1, label2, ...]]
            # 2. DataLoader in dygraph:
            #    [input1, input2, ..., label1, label2, ...]
            # 3. customized iterator yielding concatenated inputs and labels:
            #    [input1, input2, ..., label1, label2, ...]
            # 4. customized iterator yielding separated inputs and labels:
            #    ([input1, input2, ...], [label1, label2, ...])
            # To handle all of these, flatten the (possibly nested) list.
data = flatten(data)
# LoDTensor.shape is callable, where LoDTensor comes from
# DataLoader in static graph
batch_size = data[0].shape()[0] if callable(data[
0].shape) else data[0].shape[0]
callbacks.on_batch_begin(mode, step, logs)
if mode == 'train':
outs = self.train(data[:len(self._inputs)],
data[len(self._inputs):])
else:
outs = self.eval(data[:len(self._inputs)],
data[len(self._inputs):])
# losses
loss = outs[0] if self._metrics else outs
metrics = [[l[0] for l in loss]]
# metrics
for metric in self._metrics:
res = metric.accumulate()
metrics.extend(to_list(res))
assert len(metrics_name) == len(metrics)
for k, v in zip(metrics_name, metrics):
logs[k] = v
logs['step'] = step
if mode == 'train' or self._adapter._merge_count.get(
mode + '_batch', 0) <= 0:
logs['batch_size'] = batch_size * ParallelEnv().nranks
else:
logs['batch_size'] = self._adapter._merge_count[mode +
'_batch']
callbacks.on_batch_end(mode, step, logs)
self._reset_metrics()
if mode == 'train':
assert epoch is not None, 'when mode is train, epoch must be given'
callbacks.on_epoch_end(epoch)
return logs
def _reset_metrics(self):
for metric in self._metrics:
metric.reset()
def _metrics_name(self):
metrics_name = ['loss']
for m in self._metrics:
metrics_name.extend(to_list(m.name()))
return metrics_name
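# Example usage of the high-level Model API (a minimal sketch; `MyModel`,
# `optimizer`, `inputs`, `labels`, `train_dataset`, `dev_dataset` and
# `test_dataset` are hypothetical stand-ins for a concrete Model subclass
# and Dataset instances):
#
#   model = MyModel()
#   model.prepare(optimizer, loss_function=SoftmaxWithCrossEntropy(),
#                 metrics=Accuracy(), inputs=inputs, labels=labels)
#   model.fit(train_dataset, dev_dataset, batch_size=32, epochs=3,
#             save_dir='checkpoints')
#   eval_result = model.evaluate(dev_dataset, batch_size=32)
#   predictions = model.predict(test_dataset, batch_size=32)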
import os
import sys
import time
import numpy as np
class ProgressBar(object):
"""progress bar """
def __init__(self,
num=None,
width=30,
verbose=1,
start=True,
file=sys.stdout):
self._num = num
        if isinstance(num, int) and num <= 0:
            raise ValueError('num should be None or a positive integer')
max_width = self._get_max_width()
self._width = width if width <= max_width else max_width
self._total_width = 0
self._verbose = verbose
self.file = file
self._values = {}
self._values_order = []
if start:
self._start = time.time()
self._last_update = 0
self._dynamic_display = (
(hasattr(self.file, 'isatty') and
self.file.isatty()) or 'ipykernel' in sys.modules or
'posix' in sys.modules or 'PYCHARM_HOSTED' in os.environ)
def _get_max_width(self):
if sys.version_info > (3, 3):
from shutil import get_terminal_size
else:
from backports.shutil_get_terminal_size import get_terminal_size
terminal_width, _ = get_terminal_size()
max_width = min(int(terminal_width * 0.6), terminal_width - 50)
return max_width
def start(self):
self.file.flush()
self._start = time.time()
    def update(self, current_num, values=None):
        values = values or []
        now = time.time()
if current_num:
time_per_unit = (now - self._start) / current_num
else:
time_per_unit = 0
if time_per_unit >= 1 or time_per_unit == 0:
fps = ' - %.0fs/%s' % (time_per_unit, 'step')
elif time_per_unit >= 1e-3:
fps = ' - %.0fms/%s' % (time_per_unit * 1e3, 'step')
else:
fps = ' - %.0fus/%s' % (time_per_unit * 1e6, 'step')
info = ''
if self._verbose == 1:
prev_total_width = self._total_width
if self._dynamic_display:
sys.stdout.write('\b' * prev_total_width)
sys.stdout.write('\r')
else:
sys.stdout.write('\n')
if self._num is not None:
numdigits = int(np.log10(self._num)) + 1
bar_chars = ('step %' + str(numdigits) + 'd/%d [') % (
current_num, self._num)
prog = float(current_num) / self._num
prog_width = int(self._width * prog)
if prog_width > 0:
bar_chars += ('=' * (prog_width - 1))
if current_num < self._num:
bar_chars += '>'
else:
bar_chars += '='
bar_chars += ('.' * (self._width - prog_width))
bar_chars += ']'
else:
bar_chars = 'step %3d' % current_num
self._total_width = len(bar_chars)
sys.stdout.write(bar_chars)
for k, val in values:
info += ' - %s:' % k
val = val if isinstance(val, list) else [val]
for i, v in enumerate(val):
if isinstance(v, (float, np.float32, np.float64)):
if abs(v) > 1e-3:
info += ' %.4f' % v
else:
info += ' %.4e' % v
else:
info += ' %s' % v
if self._num is not None and current_num < self._num:
eta = time_per_unit * (self._num - current_num)
if eta > 3600:
eta_format = '%d:%02d:%02d' % (eta // 3600, (eta % 3600) //
60, eta % 60)
elif eta > 60:
eta_format = '%d:%02d' % (eta // 60, eta % 60)
else:
eta_format = '%ds' % eta
info += ' - ETA: %s' % eta_format
info += fps
self._total_width += len(info)
if prev_total_width > self._total_width:
info += (' ' * (prev_total_width - self._total_width))
# newline for another epoch
if self._num is not None and current_num >= self._num:
info += '\n'
if self._num is None:
info += '\n'
sys.stdout.write(info)
sys.stdout.flush()
self._last_update = now
elif self._verbose == 2:
if self._num:
numdigits = int(np.log10(self._num)) + 1
count = ('step %' + str(numdigits) + 'd/%d') % (current_num,
self._num)
else:
count = 'step %3d' % current_num
info = count + info
for k, val in values:
info += ' - %s:' % k
val = val if isinstance(val, list) else [val]
for v in val:
if isinstance(v, (float, np.float32, np.float64)):
if abs(v) > 1e-3:
info += ' %.4f' % v
else:
info += ' %.4e' % v
                    elif isinstance(v, np.ndarray) and v.size == 1 and \
                            np.issubdtype(v.dtype, np.floating):
if abs(v[0]) > 1e-3:
info += ' %.4f' % v[0]
else:
info += ' %.4e' % v[0]
else:
info += ' %s' % v
info += fps
info += '\n'
sys.stdout.write(info)
sys.stdout.flush()
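# Example usage (a minimal sketch):
#
#   bar = ProgressBar(num=100)
#   for step in range(1, 101):
#       bar.update(step, values=[('loss', 0.123)])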
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
"""
Add mask for batch_tokens, return out, mask_label, mask_pos;
Note: mask_pos responding the batch_tokens after padded;
"""
max_len = max([len(sent) for sent in batch_tokens])
mask_label = []
mask_pos = []
prob_mask = np.random.rand(total_token_num)
# Note: the first token is [CLS], so [low=1]
replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
pre_sent_len = 0
prob_index = 0
for sent_index, sent in enumerate(batch_tokens):
mask_flag = False
prob_index += pre_sent_len
for token_index, token in enumerate(sent):
prob = prob_mask[prob_index + token_index]
if prob > 0.15:
continue
elif 0.03 < prob <= 0.15:
# mask
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
elif 0.015 < prob <= 0.03:
# random replace
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = replace_ids[prob_index + token_index]
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
else:
# keep the original token
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
mask_pos.append(sent_index * max_len + token_index)
pre_sent_len = len(sent)
        # ensure that at least one word in each sentence is masked
while not mask_flag:
token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
if sent[token_index] != SEP and sent[token_index] != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
return batch_tokens, mask_label, mask_pos
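# Example usage (a minimal sketch with toy ids, where 1/2/3 play the roles of
# [CLS]/[SEP]/[MASK] as in the defaults above):
#
#   batch = [[1, 11, 12, 2], [1, 13, 14, 15, 2]]
#   out, mask_label, mask_pos = mask(batch, total_token_num=9, vocab_size=100)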
def prepare_batch_data(insts,
total_token_num,
voc_size=0,
pad_id=None,
cls_id=None,
sep_id=None,
                       mask_id=-1,
return_input_mask=True,
return_max_len=True,
return_num_token=False):
"""
1. generate Tensor of data
2. generate Tensor of position
3. generate self attention mask, [shape: batch_size * max_len * max_len]
"""
batch_src_ids = [inst[0] for inst in insts]
batch_pos_ids = [inst[1] for inst in insts]
batch_sent_ids = [inst[2] for inst in insts]
labels_list = []
# compatible with squad, whose example includes start/end positions,
# or unique id
for i in range(3, len(insts[0]), 1):
labels = [inst[i] for inst in insts]
labels = np.array(labels).astype("int64").reshape([-1, 1])
labels_list.append(labels)
# First step: do mask without padding
if mask_id >= 0:
out, mask_label, mask_pos = mask(
batch_src_ids,
total_token_num,
vocab_size=voc_size,
CLS=cls_id,
SEP=sep_id,
MASK=mask_id)
else:
out = batch_src_ids
# Second step: padding
src_id, self_input_mask = pad_batch_data(
out, pad_idx=pad_id, return_input_mask=True)
pos_id = pad_batch_data(
batch_pos_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
sent_id = pad_batch_data(
batch_sent_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
if mask_id >= 0:
return_list = [
src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos
] + labels_list
else:
return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
return return_list if len(return_list) > 1 else return_list[0]
def pad_batch_data(insts,
pad_idx=0,
return_pos=False,
return_input_mask=False,
return_max_len=False,
return_num_token=False):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and input mask.
"""
return_list = []
max_len = max(len(inst) for inst in insts)
    # Any token in the vocabulary can be used for padding, since the paddings'
    # loss will be masked out by weights and has no effect on parameter
    # gradients.
inst_data = np.array([
list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
])
return_list += [inst_data.astype("int64").reshape([-1, max_len])]
# position data
if return_pos:
inst_pos = np.array([
list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
for inst in insts
])
return_list += [inst_pos.astype("int64").reshape([-1, max_len])]
if return_input_mask:
# This is used to avoid attention on paddings.
input_mask_data = np.array([[1] * len(inst) + [0] *
(max_len - len(inst)) for inst in insts])
input_mask_data = np.expand_dims(input_mask_data, axis=-1)
return_list += [input_mask_data.astype("float32")]
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
return return_list if len(return_list) > 1 else return_list[0]
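# Example (a minimal sketch):
#
#   src, input_mask = pad_batch_data(
#       [[5, 6, 7], [8, 9]], pad_idx=0, return_input_mask=True)
#   # src has shape [2, 3]; input_mask has shape [2, 3, 1]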
if __name__ == "__main__":
pass
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"bert"
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import json
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, to_variable, Layer, guard
from hapi.text.text import PrePostProcessLayer, TransformerEncoder
from hapi.text.bert.utils.init import init_from_static_model
class BertConfig(object):
def __init__(self, config_path):
self._config_dict = self._parse(config_path)
def _parse(self, config_path):
try:
with open(config_path) as json_file:
config_dict = json.load(json_file)
except Exception:
raise IOError("Error in parsing bert model config file '%s'" %
config_path)
else:
return config_dict
def __getitem__(self, key):
return self._config_dict[key]
def print_config(self):
for arg, value in sorted(six.iteritems(self._config_dict)):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
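# Example usage (assuming a standard bert_config.json at the given path):
#
#   config = BertConfig("./config/bert_config.json")
#   config.print_config()
#   hidden_size = config['hidden_size']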
class BertEncoder(Layer):
"""
bert
"""
def __init__(self, config, return_pooled_out=True, use_fp16=False):
super(BertEncoder, self).__init__()
self.config = config
self._emb_size = config['hidden_size']
self._n_layer = config['num_hidden_layers']
self._n_head = config['num_attention_heads']
self._voc_size = config['vocab_size']
self._max_position_seq_len = config['max_position_embeddings']
self._sent_types = config['type_vocab_size']
self._hidden_act = config['hidden_act']
self._prepostprocess_dropout = config['hidden_dropout_prob']
self._attention_dropout = config['attention_probs_dropout_prob']
self.return_pooled_out = return_pooled_out
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
self._dtype = "float16" if use_fp16 else "float32"
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config['initializer_range'])
self._src_emb = Embedding(
size=[self._voc_size, self._emb_size],
param_attr=fluid.ParamAttr(
name=self._word_emb_name, initializer=self._param_initializer),
dtype=self._dtype)
self._pos_emb = Embedding(
size=[self._max_position_seq_len, self._emb_size],
param_attr=fluid.ParamAttr(
name=self._pos_emb_name, initializer=self._param_initializer),
dtype=self._dtype)
self._sent_emb = Embedding(
size=[self._sent_types, self._emb_size],
param_attr=fluid.ParamAttr(
name=self._sent_emb_name, initializer=self._param_initializer),
dtype=self._dtype)
self.pooled_fc = Linear(
input_dim=self._emb_size,
output_dim=self._emb_size,
param_attr=fluid.ParamAttr(
name="pooled_fc.w_0", initializer=self._param_initializer),
bias_attr="pooled_fc.b_0",
act="tanh")
self.pre_process_layer = PrePostProcessLayer(
"nd", self._emb_size, self._prepostprocess_dropout, None)
self._encoder = TransformerEncoder(
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
preprocess_cmd="",
postprocess_cmd="dan",
ffn_fc1_act=self._hidden_act)
def init_parameters(self, param_path="", verbose=False):
init_from_static_model(param_path, self, self.config, verbose)
def forward(self, src_ids, position_ids, sentence_ids, input_mask):
"""
forward
"""
src_emb = self._src_emb(src_ids)
pos_emb = self._pos_emb(position_ids)
sent_emb = self._sent_emb(sentence_ids)
emb_out = src_emb + pos_emb
emb_out = emb_out + sent_emb
emb_out = self.pre_process_layer(emb_out)
self_attn_mask = fluid.layers.matmul(
x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
enc_output = self._encoder(emb_out, n_head_self_attn_mask)
if not self.return_pooled_out:
return enc_output
next_sent_feat = fluid.layers.slice(
input=enc_output, axes=[1], starts=[0], ends=[1])
next_sent_feat = self.pooled_fc(next_sent_feat)
next_sent_feat = fluid.layers.reshape(
next_sent_feat, shape=[-1, self._emb_size])
return enc_output, next_sent_feat
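# Example usage under fluid.dygraph.guard() (a minimal sketch; src_ids,
# position_ids, sentence_ids and input_mask are assumed to be
# to_variable-wrapped tensors shaped as produced by prepare_batch_data):
#
#   config = BertConfig("./config/bert_config.json")
#   bert = BertEncoder(config)
#   enc_output, next_sent_feat = bert(src_ids, position_ids,
#                                     sentence_ids, input_mask)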
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
import types
import csv
import numpy as np
import hapi.text.tokenizer.tokenization as tokenization
from hapi.text.bert.batching import prepare_batch_data
class DataProcessor(object):
"""Base class for data converters for sequence classification data sets."""
def __init__(self, tokenizer, max_seq_len, in_tokens, random_seed=None):
self.max_seq_len = max_seq_len
self.tokenizer = tokenizer
self.vocab = self.tokenizer.vocab
self.in_tokens = in_tokens
np.random.seed(random_seed)
self.current_train_example = -1
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
self.current_train_epoch = -1
def get_train_iter(self,
data_dir,
epoch_num=1,
shuffle=True,
shuffle_seed=None):
"""Gets a collection of `InputExample`s for the train set."""
raise NotImplementedError()
def get_dev_iter(self, data_dir):
"""Gets a collection of `InputExample`s for the dev set."""
raise NotImplementedError()
def get_test_iter(self, data_dir):
"""Gets a collection of `InputExample`s for prediction."""
raise NotImplementedError()
def get_labels(self):
"""Gets the list of labels for this data set."""
raise NotImplementedError()
def convert_example(self, index, example, labels, max_seq_len, tokenizer):
"""Converts a single `InputExample` into a single `InputFeatures`."""
feature = convert_single_example(index, example, labels, max_seq_len,
tokenizer)
return feature
    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="utf8") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
lines.append(line)
return lines
    def generate_instance(self, feature):
        """
        Generate an instance (ids, positions, segments, label) from a given
        feature.
        Args:
            feature: InputFeatures(object). A single set of features of data.
        """
        input_pos = list(range(len(feature.input_ids)))
        # Keep the field order consistent with prepare_batch_data, which
        # reads [src_ids, pos_ids, sent_ids, label].
        return [
            feature.input_ids, input_pos, feature.segment_ids, feature.label_id
        ]
    def generate_batch_data(self,
                            batch_data,
                            total_token_num,
                            voc_size=-1,
                            mask_id=-1,
                            return_input_mask=True,
                            return_max_len=False,
                            return_num_token=False):
        # Forward the arguments instead of hard-coding them, so that callers
        # can actually control masking and the returned fields.
        return prepare_batch_data(
            batch_data,
            total_token_num,
            voc_size=voc_size,
            pad_id=self.vocab["[PAD]"],
            cls_id=self.vocab["[CLS]"],
            sep_id=self.vocab["[SEP]"],
            mask_id=mask_id,
            return_input_mask=return_input_mask,
            return_max_len=return_max_len,
            return_num_token=return_num_token)
def get_num_examples(self, phase):
"""Get number of examples for train, dev or test."""
if phase not in ['train', 'dev', 'test']:
raise ValueError(
"Unknown phase, which should be in ['train', 'dev', 'test'].")
if phase == 'train':
return len(self.train_examples)
elif phase == 'dev':
return len(self.dev_examples)
elif phase == 'test':
return len(self.test_examples)
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'dev', 'test'].")
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_train_example, self.current_train_epoch
def data_generator(self, data_iter, batch_size, phase='train',
dev_count=1):
"""
Generate data for train, dev or test.
Args:
batch_size: int. The batch size of generated data.
phase: string. The phase for which to generate data.
"""
assert phase in ['train', 'dev', 'test']
if phase == 'train':
sample_num = len(self.train_examples)
elif phase == 'dev':
sample_num = len(self.dev_examples)
elif phase == 'test':
sample_num = len(self.test_examples)
else:
sample_num = -1
self.num_examples[phase] = sample_num
def instance_reader():
for epoch_idx, example_idx, example in data_iter():
if phase == 'train':
self.current_train_epoch = epoch_idx
self.current_train_example = example_idx
feature = self.convert_example(
example_idx, example,
self.get_labels(), self.max_seq_len, self.tokenizer)
instance = self.generate_instance(feature)
yield instance
def batch_reader(reader, batch_size, in_tokens):
batch, total_token_num, max_len = [], 0, 0
for instance in reader():
                token_ids, pos_ids, sent_ids, label = instance[:4]
max_len = max(max_len, len(token_ids))
if in_tokens:
to_append = (len(batch) + 1) * max_len <= batch_size
else:
to_append = len(batch) < batch_size
if to_append:
batch.append(instance)
total_token_num += len(token_ids)
else:
yield batch, total_token_num
batch, total_token_num, max_len = [instance], len(
token_ids), len(token_ids)
if len(batch) > 0:
yield batch, total_token_num
def wrapper():
all_dev_batches = []
for batch_data, total_token_num in batch_reader(
instance_reader, batch_size, self.in_tokens):
batch_data = self.generate_batch_data(
batch_data,
total_token_num,
voc_size=-1,
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False)
if len(all_dev_batches) < dev_count:
all_dev_batches.append(batch_data)
if len(all_dev_batches) == dev_count:
for batch in all_dev_batches:
yield batch
all_dev_batches = []
return wrapper
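# Example usage (a minimal sketch; XnliProcessor is defined below and
# `tokenizer` is assumed to be a tokenization.FullTokenizer built from the
# BERT vocab):
#
#   processor = XnliProcessor(tokenizer, max_seq_len=128, in_tokens=False)
#   train_iter = processor.get_train_iter(data_dir, epoch_num=1)
#   generator = processor.data_generator(train_iter, batch_size=32,
#                                        phase='train')
#   for src_id, pos_id, sent_id, input_mask, label in generator():
#       ...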
class InputExample(object):
"""A single training/test example for simple sequence classification."""
def __init__(self, guid, text_a, text_b=None, label=None):
"""Constructs a InputExample.
Args:
guid: Unique id for the example.
text_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
text_b: (Optional) string. The untokenized text of the second sequence.
Only must be specified for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
"""
self.guid = guid
self.text_a = text_a
self.text_b = text_b
self.label = label
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, input_ids, input_mask, segment_ids, label_id):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.label_id = label_id
class XnliProcessor(DataProcessor):
"""Processor for the XNLI data set."""
def get_train_iter(self,
data_dir,
epoch_num=1,
shuffle=True,
shuffle_seed=None):
"""See base class."""
self.language = "zh"
lines = self._read_tsv(
os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" %
self.language))
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "train-%d" % (i)
text_a = tokenization.convert_to_unicode(line[0])
text_b = tokenization.convert_to_unicode(line[1])
label = tokenization.convert_to_unicode(line[2])
if label == tokenization.convert_to_unicode("contradictory"):
label = tokenization.convert_to_unicode("contradiction")
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
self.train_examples = examples
def wrapper():
if shuffle:
if shuffle_seed is not None:
np.random.seed(shuffle_seed)
for epoch_idx in range(epoch_num):
if shuffle:
np.random.shuffle(examples)
for (example_idx, example) in enumerate(examples):
yield epoch_idx, example_idx, example
return wrapper
def get_dev_iter(self, data_dir):
"""See base class."""
self.language = "zh"
lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "dev-%d" % (i)
language = tokenization.convert_to_unicode(line[0])
if language != tokenization.convert_to_unicode(self.language):
continue
text_a = tokenization.convert_to_unicode(line[6])
text_b = tokenization.convert_to_unicode(line[7])
label = tokenization.convert_to_unicode(line[1])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
self.dev_examples = examples
def wrapper():
for (example_idx, example) in enumerate(examples):
yield 0, example_idx, example
return wrapper
def get_test_iter(self, data_dir):
"""See base class."""
self.language = "zh"
lines = self._read_tsv(os.path.join(data_dir, "xnli.test.tsv"))
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "test-%d" % (i)
language = tokenization.convert_to_unicode(line[0])
if language != tokenization.convert_to_unicode(self.language):
continue
text_a = tokenization.convert_to_unicode(line[6])
text_b = tokenization.convert_to_unicode(line[7])
label = tokenization.convert_to_unicode(line[1])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
self.test_examples = examples
def wrapper():
for (example_idx, example) in enumerate(examples):
yield 0, example_idx, example
return wrapper
def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]
class MnliProcessor(DataProcessor):
"""Processor for the MultiNLI data set (GLUE version)."""
def get_train_iter(self,
data_dir,
epoch_num=1,
shuffle=True,
shuffle_seed=None):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
self.train_examples = examples
def wrapper():
if shuffle:
if shuffle_seed is not None:
np.random.seed(shuffle_seed)
for epoch_idx in range(epoch_num):
if shuffle:
np.random.shuffle(examples)
for (example_idx, example) in enumerate(examples):
yield epoch_idx, example_idx, example
return wrapper
def get_dev_iter(self, data_dir):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
"dev_matched")
self.dev_examples = examples
def wrapper():
for (example_idx, example) in enumerate(examples):
yield 0, example_idx, example
return wrapper
def get_test_iter(self, data_dir):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test")
self.test_examples = examples
def wrapper():
for (example_idx, example) in enumerate(examples):
yield 0, example_idx, example
return wrapper
def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type,
tokenization.convert_to_unicode(line[0]))
text_a = tokenization.convert_to_unicode(line[8])
text_b = tokenization.convert_to_unicode(line[9])
if set_type == "test":
label = "contradiction"
else:
label = tokenization.convert_to_unicode(line[-1])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class MrpcProcessor(DataProcessor):
"""Processor for the MRPC data set (GLUE version)."""
def get_train_iter(self,
data_dir,
epoch_num=1,
shuffle=True,
shuffle_seed=None):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
self.train_examples = examples
def wrapper():
if shuffle:
if shuffle_seed is not None:
np.random.seed(shuffle_seed)
for epoch_idx in range(epoch_num):
if shuffle:
np.random.shuffle(examples)
for (example_idx, example) in enumerate(examples):
yield epoch_idx, example_idx, example
return wrapper
    def get_dev_iter(self, data_dir):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
self.dev_examples = examples
def wrapper():
for (example_idx, example) in enumerate(examples):
yield 0, example_idx, example
return wrapper
    def get_test_iter(self, data_dir):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
self.test_examples = examples
def wrapper():
for (example_idx, example) in enumerate(examples):
yield 0, example_idx, example
return wrapper
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = tokenization.convert_to_unicode(line[3])
text_b = tokenization.convert_to_unicode(line[4])
if set_type == "test":
label = "0"
else:
label = tokenization.convert_to_unicode(line[0])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class ColaProcessor(DataProcessor):
"""Processor for the CoLA data set (GLUE version)."""
def get_train_iter(self,
data_dir,
epoch_num=1,
shuffle=True,
shuffle_seed=None):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
self.train_examples = examples
def wrapper():
if shuffle:
if shuffle_seed is not None:
np.random.seed(shuffle_seed)
for epoch_idx in range(epoch_num):
if shuffle:
np.random.shuffle(examples)
for (example_idx, example) in enumerate(examples):
yield epoch_idx, example_idx, example
return wrapper
def get_dev_iter(self, data_dir):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
self.dev_examples = examples
def wrapper():
for (example_idx, example) in enumerate(examples):
yield 0, example_idx, example
return wrapper
def get_test_iter(self, data_dir):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
self.test_examples = examples
def wrapper():
for (example_idx, example) in enumerate(examples):
yield 0, example_idx, example
return wrapper
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
# Only the test set has a header
if set_type == "test" and i == 0:
continue
guid = "%s-%s" % (set_type, i)
if set_type == "test":
text_a = tokenization.convert_to_unicode(line[1])
label = "0"
else:
text_a = tokenization.convert_to_unicode(line[3])
label = tokenization.convert_to_unicode(line[1])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=None, label=label))
return examples
def convert_single_example_to_unicode(guid, single_example):
text_a = tokenization.convert_to_unicode(single_example[0])
text_b = tokenization.convert_to_unicode(single_example[1])
label = tokenization.convert_to_unicode(single_example[2])
return InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
def convert_single_example(ex_index, example, label_list, max_seq_length,
tokenizer):
"""Converts a single `InputExample` into a single `InputFeatures`."""
label_map = {}
for (i, label) in enumerate(label_list):
label_map[label] = i
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
label_id = label_map[example.label]
feature = InputFeatures(
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id)
return feature
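# Example usage (a minimal sketch; `tokenizer` is assumed to be a
# tokenization.FullTokenizer built from the BERT vocab):
#
#   example = InputExample(guid="dev-1", text_a="the dog is hairy .",
#                          label="0")
#   feature = convert_single_example(0, example, ["0", "1"],
#                                    max_seq_length=128, tokenizer=tokenizer)
#   # feature.input_ids covers [CLS] + tokens + [SEP]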
def convert_examples_to_features(examples, label_list, max_seq_length,
tokenizer):
"""Convert a set of `InputExample`s to a list of `InputFeatures`."""
features = []
for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
print("Writing example %d of %d" % (ex_index, len(examples)))
feature = convert_single_example(ex_index, example, label_list,
max_seq_length, tokenizer)
features.append(feature)
return features
if __name__ == '__main__':
    pass
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
import six
import csv
import glob
import tarfile
import itertools
from functools import partial
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.fluid.io import BatchSampler, DataLoader, Dataset
from hapi.distributed import DistributedBatchSampler
from hapi.text.bert.data_processor import DataProcessor, XnliProcessor, ColaProcessor, MrpcProcessor, MnliProcessor
from hapi.text.bert.batching import prepare_batch_data
import hapi.text.tokenizer.tokenization as tokenization
__all__ = [
'BertInputExample', 'BertInputFeatures', 'SingleSentenceDataset',
'SentencePairDataset'
]
class BertInputExample(object):
def __init__(self, uid, text_a, text_b=None, label=None):
self.uid = uid
self.text_a = text_a
self.text_b = text_b
self.label = label
class BertInputFeatures(object):
def __init__(self, input_ids, input_mask, segment_ids, label_id):
self.input_ids = input_ids
self.pos_ids = list(range(len(self.input_ids)))
self.input_mask = input_mask
self.segment_ids = segment_ids
self.label_id = label_id
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def convert_single_example_to_unicode(guid, single_example):
text_a = tokenization.convert_to_unicode(single_example[0])
text_b = tokenization.convert_to_unicode(single_example[1])
label = tokenization.convert_to_unicode(single_example[2])
    return BertInputExample(uid=guid, text_a=text_a, text_b=text_b, label=label)
def convert_single_example(ex_index, example, label_list, max_seq_length,
tokenizer):
"""Converts a single `BertInputExample` into a single `BertInputFeatures`."""
label_map = {}
for (i, label) in enumerate(label_list):
label_map[label] = i
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_mask = [1] * len(input_ids)
label_id = label_map[example.label]
feature = BertInputFeatures(
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id)
return feature
def convert_examples_to_features(examples, label_list, max_seq_length,
tokenizer):
"""Convert a set of `InputExample`s to a list of `InputFeatures`."""
features = []
for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
print("Writing example %d of %d" % (ex_index, len(examples)))
feature = convert_single_example(ex_index, example, label_list,
max_seq_length, tokenizer)
features.append(feature)
return features
def _read_tsv(input_file, delimiter="\t", quotechar=None):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="utf8") as f:
reader = csv.reader(f, delimiter=delimiter, quotechar=quotechar)
lines = []
for line in reader:
lines.append(line)
return lines
class SingleSentenceDataset(Dataset):
def __init__(self,
tokenizer,
label_list,
max_seq_length,
mode="all_in_memory"):
        assert isinstance(mode,
                          str), "mode of SingleSentenceDataset should be str"
        assert mode in [
            "all_in_memory", "leveldb"
        ], "mode of SingleSentenceDataset should be in [all_in_memory, leveldb], but got %s" % mode
self.examples = []
def load_all_data_in_memory(self,
input_file,
label_list,
max_seq_length,
tokenizer,
line_processor=None,
delimiter="\t",
quotechar=None):
lines = _read_tsv(input_file, delimiter=delimiter, quotechar=quotechar)
def default_line_processor(line_id, line):
assert len(line) == 2
text_a = line[0]
label = line[1]
return BertInputExample(
str(line_id), text_a=text_a, text_b=None, label=label)
if line_processor is None:
line_processor = default_line_processor
for (line_id, line) in enumerate(lines):
input_example = line_processor(line_id, line)
if not input_example:
continue
input_feature = convert_single_example(
str(line_id), input_example, label_list, max_seq_length,
tokenizer)
self.examples.append(input_feature)
def __getitem__(self, idx):
return self.examples[idx].input_ids, self.examples[
idx].pos_ids, self.examples[idx].segment_ids, self.examples[
idx].label_id
def __len__(self):
return len(self.examples)
class SentencePairDataset(Dataset):
    def __init__(self,
                 tokenizer,
                 label_list,
                 max_seq_length,
                 mode="all_in_memory"):
        assert isinstance(mode,
                          str), "mode of SentencePairDataset should be str"
        assert mode in [
            "all_in_memory", "leveldb"
        ], "mode of SentencePairDataset should be in [all_in_memory, leveldb], but got %s" % mode
self.examples = []
def load_all_data_in_memory(self,
input_file,
label_list,
max_seq_length,
tokenizer,
line_processor=None,
delimiter="\t",
quotechar=None):
lines = _read_tsv(input_file, delimiter=delimiter, quotechar=quotechar)
def default_line_processor(line_id, line):
assert len(line) == 3
text_a = line[0]
text_b = line[1]
label = line[2]
return BertInputExample(
str(line_id), text_a=text_a, text_b=text_b, label=label)
if line_processor is None:
line_processor = default_line_processor
for (line_id, line) in enumerate(lines):
input_example = line_processor(line_id, line)
if not input_example:
continue
input_feature = convert_single_example(
str(line_id), input_example, label_list, max_seq_length,
tokenizer)
self.examples.append(input_feature)
def __getitem__(self, idx):
return self.examples[idx].input_ids, self.examples[
idx].pos_ids, self.examples[idx].segment_ids, self.examples[
idx].label_id
def __len__(self):
return len(self.examples)
def _prepare_train_batch(insts,
vocab_size=0,
pad_id=None,
cls_id=None,
sep_id=None,
mask_id=-1,
return_input_mask=True,
return_max_len=True,
return_num_token=False):
return prepare_batch_data(
insts,
0,
voc_size=vocab_size,
pad_id=pad_id,
cls_id=cls_id,
sep_id=sep_id,
mask_id=mask_id,
return_input_mask=return_input_mask,
return_max_len=return_max_len,
return_num_token=return_num_token)
class SingleSentenceDataLoader(object):
def __init__(self,
input_file,
tokenizer,
label_list,
max_seq_length,
batch_size,
shuffle=False,
drop_last=False,
mode="all_in_memory",
line_processor=None,
delimiter="\t",
quotechar=None,
device=fluid.CPUPlace(),
num_workers=0,
return_list=True):
self.dataset = SingleSentenceDataset(tokenizer, label_list,
max_seq_length, mode)
if mode == "all_in_memory":
self.dataset.load_all_data_in_memory(
input_file, label_list, max_seq_length, tokenizer,
line_processor, delimiter, quotechar)
elif mode == "leveldb":
#TODO add leveldb reader
pass
else:
raise ValueError("mode should be in [all_in_memory, leveldb]")
self.sampler = DistributedBatchSampler(
self.dataset, batch_size, shuffle=shuffle, drop_last=drop_last)
self.dataloader = DataLoader(
dataset=self.dataset,
batch_sampler=self.sampler,
places=device,
collate_fn=partial(
_prepare_train_batch,
vocab_size=-1,
pad_id=tokenizer.vocab["[PAD]"],
cls_id=tokenizer.vocab["[CLS]"],
sep_id=tokenizer.vocab["[SEP]"],
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False),
num_workers=num_workers,
return_list=return_list)
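# Example usage (a minimal sketch; the TSV file is assumed to contain
# "text<TAB>label" lines, as expected by the default line processor):
#
#   tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case=True)
#   loader = SingleSentenceDataLoader(
#       "train.tsv", tokenizer, label_list=["0", "1"],
#       max_seq_length=128, batch_size=32, shuffle=True)
#   for src_ids, pos_ids, sent_ids, input_mask, labels in loader.dataloader:
#       ...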
if __name__ == "__main__":
print("hello world.")
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
class ConstantLR(LearningRateDecay):
def __init__(self, learning_rate, begin=0, step=1, dtype='float32'):
super(ConstantLR, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
def step(self):
return self.learning_rate
class LinearDecay(LearningRateDecay):
def __init__(self,
learning_rate,
warmup_steps,
decay_steps,
end_learning_rate=0.0001,
power=1.0,
cycle=False,
begin=0,
step=1,
dtype='float32'):
super(LinearDecay, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
self.warmup_steps = warmup_steps
self.decay_steps = decay_steps
self.end_learning_rate = end_learning_rate
self.power = power
self.cycle = cycle
def step(self):
if self.step_num < self.warmup_steps:
decayed_lr = self.learning_rate * (self.step_num /
self.warmup_steps)
decayed_lr = self.create_lr_var(decayed_lr)
else:
tmp_step_num = self.step_num
tmp_decay_steps = self.decay_steps
if self.cycle:
div_res = fluid.layers.ceil(
self.create_lr_var(tmp_step_num / float(self.decay_steps)))
if tmp_step_num == 0:
div_res = self.create_lr_var(1.0)
tmp_decay_steps = self.decay_steps * div_res
else:
tmp_step_num = self.create_lr_var(
tmp_step_num
if tmp_step_num < self.decay_steps else self.decay_steps)
decayed_lr = (self.learning_rate - self.end_learning_rate) * \
((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate
return decayed_lr
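# With power=1.0 and cycle=False, the schedule above reduces to plain linear
# warmup followed by linear decay:
#
#   lr(t) = learning_rate * t / warmup_steps                           , t < warmup_steps
#   lr(t) = (learning_rate - end_lr) * (1 - t / decay_steps) + end_lr  , otherwise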
class Optimizer(object):
def __init__(self,
warmup_steps,
num_train_steps,
learning_rate,
model_cls,
weight_decay,
scheduler='linear_warmup_decay',
loss_scaling=1.0,
parameter_list=None):
self.warmup_steps = warmup_steps
self.num_train_steps = num_train_steps
self.learning_rate = learning_rate
self.model_cls = model_cls
self.weight_decay = weight_decay
self.scheduler = scheduler
self.loss_scaling = loss_scaling
self.parameter_list = parameter_list
self.scheduled_lr = 0.0
self.optimizer = self.lr_schedule()
def lr_schedule(self):
if self.warmup_steps > 0:
if self.scheduler == 'noam_decay':
self.scheduled_lr = fluid.dygraph.NoamDecay(1 / (
self.warmup_steps * (self.learning_rate**2)),
self.warmup_steps)
elif self.scheduler == 'linear_warmup_decay':
self.scheduled_lr = LinearDecay(self.learning_rate,
self.warmup_steps,
self.num_train_steps, 0.0)
else:
raise ValueError("Unkown learning rate scheduler, should be "
"'noam_decay' or 'linear_warmup_decay'")
optimizer = fluid.optimizer.Adam(
learning_rate=self.scheduled_lr,
parameter_list=self.parameter_list)
else:
self.scheduled_lr = ConstantLR(self.learning_rate)
optimizer = fluid.optimizer.Adam(
learning_rate=self.scheduled_lr,
parameter_list=self.parameter_list)
return optimizer
def exclude_from_weight_decay(self, name):
if name.find("layer_norm") > -1:
return True
bias_suffix = ["_bias", "_b", ".b_0"]
for suffix in bias_suffix:
if name.endswith(suffix):
return True
return False
def minimize(self, loss, use_data_parallel=False, model=None):
param_list = dict()
clip_norm_thres = 1.0
#grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm_thres)
if use_data_parallel:
loss = model.scale_loss(loss)
loss.backward()
if self.weight_decay > 0:
for param in self.model_cls.parameters():
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
if use_data_parallel:
assert model is not None
model.apply_collective_grads()
#_, param_grads = self.optimizer.minimize(loss, grad_clip=grad_clip)
_, param_grads = self.optimizer.minimize(loss)
        if self.weight_decay > 0:
            for param, grad in param_grads:
                if self.exclude_from_weight_decay(param.name):
                    continue
                lr = self.scheduled_lr.step()
                if not isinstance(lr, float):
                    lr = lr.numpy()
                updated_param = param.numpy() - param_list[
                    param.name].numpy() * self.weight_decay * lr
                # NOTE: rebinding the local name `param` would not write the
                # decayed value back to the parameter, so update it in place
                # (assumes dygraph parameters expose set_value, as in recent
                # Paddle versions).
                param.set_value(updated_param)
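# Example usage in dygraph mode (a minimal sketch; `model` is assumed to be a
# fluid.dygraph.Layer whose forward pass has produced `loss`):
#
#   optimizer = Optimizer(
#       warmup_steps=1000, num_train_steps=10000, learning_rate=1e-4,
#       model_cls=model, weight_decay=0.01,
#       parameter_list=model.parameters())
#   optimizer.minimize(loss)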
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
from utils.fp16 import create_master_params_grads, master_param_to_train_param, apply_dynamic_loss_scaling
def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
""" Applies linear warmup of learning rate from 0 and decay to 0."""
with fluid.default_main_program()._lr_schedule_guard():
lr = fluid.layers.tensor.create_global_var(
shape=[1],
value=0.0,
dtype='float32',
persistable=True,
name="scheduled_learning_rate")
global_step = fluid.layers.learning_rate_scheduler._decay_step_counter(
)
with fluid.layers.control_flow.Switch() as switch:
with switch.case(global_step < warmup_steps):
warmup_lr = learning_rate * (global_step / warmup_steps)
fluid.layers.tensor.assign(warmup_lr, lr)
with switch.default():
decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
learning_rate=learning_rate,
decay_steps=num_train_steps,
end_learning_rate=0.0,
power=1.0,
cycle=False)
fluid.layers.tensor.assign(decayed_lr, lr)
return lr
def optimization(loss,
warmup_steps,
num_train_steps,
learning_rate,
train_program,
startup_prog,
weight_decay,
scheduler='linear_warmup_decay',
use_fp16=False,
use_dynamic_loss_scaling=False,
init_loss_scaling=1.0,
incr_every_n_steps=1000,
decr_every_n_nan_or_inf=2,
incr_ratio=2.0,
decr_ratio=0.8):
scheduled_lr, loss_scaling = None, None
if scheduler == 'noam_decay':
if warmup_steps > 0:
scheduled_lr = fluid.layers.learning_rate_scheduler\
.noam_decay(1/(warmup_steps *(learning_rate ** 2)),
warmup_steps)
else:
            print(
                "WARNING: noam decay of learning rate should have positive "
                "warmup steps but given {}, using constant learning rate "
                "instead!".format(warmup_steps))
scheduled_lr = fluid.layers.create_global_var(
name=fluid.unique_name.generate("learning_rate"),
shape=[1],
value=learning_rate,
dtype='float32',
persistable=True)
elif scheduler == 'linear_warmup_decay':
if warmup_steps > 0:
scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
num_train_steps)
else:
            print(
                "WARNING: linear warmup decay of learning rate should have "
                "positive warmup steps but given {}, using constant learning "
                "rate instead!".format(warmup_steps))
scheduled_lr = fluid.layers.create_global_var(
name=fluid.unique_name.generate("learning_rate"),
shape=[1],
value=learning_rate,
dtype='float32',
persistable=True)
else:
raise ValueError("Unkown learning rate scheduler, should be "
"'noam_decay' or 'linear_warmup_decay'")
optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
def exclude_from_weight_decay(param):
name = param.name.rstrip(".master")
if name.find("layer_norm") > -1:
return True
bias_suffix = ["_bias", "_b", ".b_0"]
for suffix in bias_suffix:
if name.endswith(suffix):
return True
return False
param_list = dict()
if use_fp16:
loss_scaling = fluid.layers.create_global_var(
name=fluid.unique_name.generate("loss_scaling"),
shape=[1],
value=init_loss_scaling,
dtype='float32',
persistable=True)
loss *= loss_scaling
param_grads = optimizer.backward(loss)
master_param_grads = create_master_params_grads(
param_grads, train_program, startup_prog, loss_scaling)
if weight_decay > 0:
for param, _ in master_param_grads:
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
if use_dynamic_loss_scaling:
apply_dynamic_loss_scaling(
loss_scaling, master_param_grads, incr_every_n_steps,
decr_every_n_nan_or_inf, incr_ratio, decr_ratio)
optimizer.apply_gradients(master_param_grads)
if weight_decay > 0:
for param, grad in master_param_grads:
if exclude_from_weight_decay(param):
continue
with param.block.program._optimized_guard(
[param, grad]), fluid.framework.name_scope("weight_decay"):
updated_param = param - param_list[
param.name] * weight_decay * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
master_param_to_train_param(master_param_grads, param_grads,
train_program)
else:
if weight_decay > 0:
for param in train_program.all_parameters():
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
_, param_grads = optimizer.minimize(loss)
if weight_decay > 0:
for param, grad in param_grads:
if exclude_from_weight_decay(param):
continue
with param.block.program._optimized_guard(
[param, grad]), fluid.framework.name_scope("weight_decay"):
updated_param = param - param_list[
param.name] * weight_decay * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
return scheduled_lr, loss_scaling
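# A minimal usage sketch (hypothetical programs and values, illustration only):
#
#   train_prog, startup_prog = fluid.Program(), fluid.Program()
#   with fluid.program_guard(train_prog, startup_prog):
#       loss = ...  # build the forward pass and loss here
#       scheduled_lr, loss_scaling = optimization(
#           loss, warmup_steps=1000, num_train_steps=10000,
#           learning_rate=5e-5, train_program=train_prog,
#           startup_prog=startup_prog, weight_decay=0.01,
#           scheduler='linear_warmup_decay')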
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from hapi.text.bert.utils.args import str2bool as str2bool
from hapi.text.bert.utils.args import ArgumentGroup as ArgumentGroup
from hapi.text.bert.utils.args import print_arguments as print_arguments
from hapi.text.bert.utils.args import check_cuda as check_cuda
from hapi.text.bert.utils.cards import get_cards as get_cards
from hapi.text.bert.utils.fp16 import cast_fp16_to_fp32 as cast_fp16_to_fp32
from hapi.text.bert.utils.fp16 import cast_fp32_to_fp16 as cast_fp32_to_fp16
from hapi.text.bert.utils.fp16 import copy_to_master_param as copy_to_master_param
from hapi.text.bert.utils.fp16 import create_master_params_grads as create_master_params_grads
from hapi.text.bert.utils.fp16 import master_param_to_train_param as master_param_to_train_param
from hapi.text.bert.utils.init import init_checkpoint as init_checkpoint
from hapi.text.bert.utils.init import init_pretraining_params as init_pretraining_params
from hapi.text.bert.utils.init import init_from_static_model as init_from_static_model
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Arguments for configuration."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import sys
import argparse
import paddle.fluid as fluid
def str2bool(v):
# argparse cannot parse strings like "True"/"False" into Python
# booleans directly, so map common truthy spellings by hand
return v.lower() in ("true", "t", "1")
class ArgumentGroup(object):
def __init__(self, parser, title, des):
self._group = parser.add_argument_group(title=title, description=des)
def add_arg(self, name, type, default, help, **kwargs):
type = str2bool if type == bool else type
self._group.add_argument(
"--" + name,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(six.iteritems(vars(args))):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
def check_cuda(use_cuda, err = \
"\nYou cannot set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \
Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n"
):
try:
if use_cuda and not fluid.is_compiled_with_cuda():
print(err)
sys.exit(1)
except Exception as e:
pass
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
def get_cards():
"""
get gpu cards number
"""
num = 0
cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
if cards != '':
num = len(cards.split(","))
return num
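# Example (assumed environment, illustration only): with
# CUDA_VISIBLE_DEVICES="0,1,2,3" get_cards() returns 4; when the variable is
# unset or empty it returns 0.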
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import sys
import os
def usage():
"""
usage information
"""
print()
print("please use command: ")
print(
"python convert_static_to_dygraph.py input_params_dir output_params_dir"
)
print()
def convert_static_to_dygraph(static_model_path, dygraph_model_path):
"""
convert paddle static bert model to dygraph model
"""
def mkdir(path):
if not os.path.isdir(path):
if os.path.split(path)[0]:
mkdir(os.path.split(path)[0])
else:
return
os.mkdir(path)
if os.path.exists(dygraph_model_path):
shutil.rmtree(dygraph_model_path)
mkdir(dygraph_model_path)
if not os.path.exists(static_model_path):
print("paddle static model path doesn't exist.....")
return -1
file_list = []
for root, dirs, files in os.walk(static_model_path):
file_list.extend(files)
os.makedirs(os.path.join(dygraph_model_path, "PretrainModelLayer_0"))
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0"))
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/PrePostProcessLayer_0"))
os.makedirs(
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0"))
#os.chdir(static_model_path)
#convert embedding file
embedding_type = ["word", "pos", "sent"]
for i in range(3):
src_name = embedding_type[i] + "_embedding"
trg_name = "Embedding_" + str(i) + "." + src_name
shutil.copyfile(
os.path.join(static_model_path, src_name),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/" + trg_name))
#convert pre_encoder file
shutil.copyfile(
os.path.join(static_model_path, "pre_encoder_layer_norm_scale"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
))
shutil.copyfile(
os.path.join(static_model_path, "pre_encoder_layer_norm_bias"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
))
#convert mask lm params file
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_out_fc.b_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/Layer_0.mask_lm_out_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_fc.b_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_0.mask_lm_trans_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_fc.w_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_0.mask_lm_trans_fc.w_0"))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_layer_norm_bias"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_layer_norm_scale"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
))
shutil.copyfile(
os.path.join(static_model_path, "next_sent_fc.b_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_1.next_sent_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "next_sent_fc.w_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_1.next_sent_fc.w_0"))
shutil.copyfile(
os.path.join(static_model_path, "pooled_fc.b_0"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "pooled_fc.w_0"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.w_0"))
encoder_num = 0
for f in file_list:
if not f.startswith("encoder_layer"):
continue
layer_num = f.split('_')[2]
if int(layer_num) > encoder_num:
encoder_num = int(layer_num)
encoder_num += 1
for i in range(encoder_num):
encoder_dir = "EncoderSubLayer_" + str(i)
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir))
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir +
"/PositionwiseFeedForwardLayer_0"))
os.makedirs(
os.path.join(
dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir + "/MultiHeadAttentionLayer_0"))
os.makedirs(
os.path.join(
dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_1"))
os.makedirs(
os.path.join(
dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_3"))
encoder_map_dict = {
"ffn_fc_0.b_0":
("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.b_0"),
"ffn_fc_0.w_0":
("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.w_0"),
"ffn_fc_1.b_0":
("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.b_0"),
"ffn_fc_1.w_0":
("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.w_0"),
"multi_head_att_key_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_1.key_fc.b_0"),
"multi_head_att_key_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_1.key_fc.w_0"),
"multi_head_att_output_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_3.output_fc.b_0"),
"multi_head_att_output_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_3.output_fc.w_0"),
"multi_head_att_query_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_0.query_fc.b_0"),
"multi_head_att_query_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_0.query_fc.w_0"),
"multi_head_att_value_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_2.value_fc.b_0"),
"multi_head_att_value_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_2.value_fc.w_0"),
"post_att_layer_norm_bias":
("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_bias"),
"post_att_layer_norm_scale":
("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_scale"),
"post_ffn_layer_norm_bias":
("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_bias"),
"post_ffn_layer_norm_scale":
("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_scale")
}
for f in file_list:
if not f.startswith("encoder_layer"):
continue
layer_num = f.split('_')[2]
suffix_name = "_".join(f.split('_')[3:])
in_dir = encoder_map_dict[suffix_name][0]
rename = encoder_map_dict[suffix_name][1]
encoder_layer = "EncoderSubLayer_" + layer_num
shutil.copyfile(
os.path.join(static_model_path, f),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/EncoderLayer_0/" +
encoder_layer + "/" + in_dir + "/" + rename))
if __name__ == "__main__":
if len(sys.argv) < 3:
usage()
exit(1)
static_model_path = sys.argv[1]
dygraph_model_path = sys.argv[2]
convert_static_to_dygraph(static_model_path, dygraph_model_path)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
def cast_fp16_to_fp32(i, o, prog):
prog.global_block().append_op(
type="cast",
inputs={"X": i},
outputs={"Out": o},
attrs={
"in_dtype": fluid.core.VarDesc.VarType.FP16,
"out_dtype": fluid.core.VarDesc.VarType.FP32
})
def cast_fp32_to_fp16(i, o, prog):
prog.global_block().append_op(
type="cast",
inputs={"X": i},
outputs={"Out": o},
attrs={
"in_dtype": fluid.core.VarDesc.VarType.FP32,
"out_dtype": fluid.core.VarDesc.VarType.FP16
})
def copy_to_master_param(p, block):
v = block.vars.get(p.name, None)
if v is None:
raise ValueError("no param name %s found!" % p.name)
new_p = fluid.framework.Parameter(
block=block,
shape=v.shape,
dtype=fluid.core.VarDesc.VarType.FP32,
type=v.type,
lod_level=v.lod_level,
stop_gradient=p.stop_gradient,
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name + ".master")
return new_p
def create_master_params_grads(params_grads, main_prog, startup_prog,
loss_scaling):
master_params_grads = []
tmp_role = main_prog._current_role
OpRole = fluid.core.op_proto_and_checker_maker.OpRole
main_prog._current_role = OpRole.Backward
for p, g in params_grads:
# create master parameters
master_param = copy_to_master_param(p, main_prog.global_block())
startup_master_param = startup_prog.global_block()._clone_variable(
master_param)
startup_p = startup_prog.global_block().var(p.name)
cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
# cast fp16 gradients to fp32 before apply gradients
if g.name.find("layer_norm") > -1:
if loss_scaling > 1:
scaled_g = g / float(loss_scaling)
else:
scaled_g = g
master_params_grads.append([p, scaled_g])
continue
master_grad = fluid.layers.cast(g, "float32")
if loss_scaling > 1:
master_grad = master_grad / float(loss_scaling)
master_params_grads.append([master_param, master_grad])
main_prog._current_role = tmp_role
return master_params_grads
def master_param_to_train_param(master_params_grads, params_grads, main_prog):
for idx, m_p_g in enumerate(master_params_grads):
train_p, _ = params_grads[idx]
if train_p.name.find("layer_norm") > -1:
continue
with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)
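# How these helpers fit together (a summary of the call sites in
# optimization.py above): create_master_params_grads() builds fp32 master
# copies of the fp16 parameters and rescales gradients by 1/loss_scaling,
# the optimizer updates the fp32 masters, and master_param_to_train_param()
# casts the updated masters back onto the fp16 training parameters.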
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import six
import ast
import copy
import numpy as np
import paddle.fluid as fluid
def cast_fp32_to_fp16(exe, main_program):
print("Cast parameters to float16 data format.")
for param in main_program.global_block().all_parameters():
if not param.name.endswith(".master"):
param_t = fluid.global_scope().find_var(param.name).get_tensor()
data = np.array(param_t)
if param.name.find("layer_norm") == -1:
param_t.set(np.float16(data).view(np.uint16), exe.place)
master_param_var = fluid.global_scope().find_var(param.name +
".master")
if master_param_var is not None:
master_param_var.get_tensor().set(data, exe.place)
def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
assert os.path.exists(
init_checkpoint_path), "[%s] cannot be found." % init_checkpoint_path
def existed_persistables(var):
if not fluid.io.is_persistable(var):
return False
return os.path.exists(os.path.join(init_checkpoint_path, var.name))
fluid.io.load_vars(
exe,
init_checkpoint_path,
main_program=main_program,
predicate=existed_persistables)
print("Load model from {}".format(init_checkpoint_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
def init_pretraining_params(exe,
pretraining_params_path,
main_program,
use_fp16=False):
assert os.path.exists(pretraining_params_path
), "[%s] cannot be found." % pretraining_params_path
def existed_params(var):
if not isinstance(var, fluid.framework.Parameter):
return False
return os.path.exists(os.path.join(pretraining_params_path, var.name))
fluid.io.load_vars(
exe,
pretraining_params_path,
main_program=main_program,
predicate=existed_params)
print("Load pretraining parameters from {}.".format(
pretraining_params_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
def init_from_static_model(dir_path,
backbone_model,
bert_config,
verbose=False):
def load_numpy_weight(file_name):
if six.PY2:
res = np.load(os.path.join(dir_path, file_name), allow_pickle=True)
else:
res = np.load(
os.path.join(dir_path, file_name),
allow_pickle=True,
encoding='latin1')
assert res is not None
return res
# load word embedding
_param = load_numpy_weight("word_embedding")
backbone_model._src_emb.set_dict({"weight": _param})
if verbose:
print("INIT word embedding")
_param = load_numpy_weight("pos_embedding")
backbone_model._pos_emb.set_dict({"weight": _param})
if verbose:
print("INIT pos embedding")
_param = load_numpy_weight("sent_embedding")
backbone_model._sent_emb.set_dict({"weight": _param})
if verbose:
print("INIT sent embedding")
_param0 = load_numpy_weight("pooled_fc.w_0")
_param1 = load_numpy_weight("pooled_fc.b_0")
backbone_model.pooled_fc.set_dict({"weight": _param0, "bias": _param1})
if verbose:
print("INIT pooled_fc")
_param0 = load_numpy_weight("pre_encoder_layer_norm_scale")
_param1 = load_numpy_weight("pre_encoder_layer_norm_bias")
backbone_model.pre_process_layer._sub_layers["layer_norm_0"].set_dict({
"weight": _param0,
"bias": _param1
})
if verbose:
print("INIT pre_encoder layer norm")
for _i in range(bert_config["num_hidden_layers"]):
_param_weight = "encoder_layer_%d_multi_head_att_query_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_query_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" %
_i].self_attn.q_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT multi_head_att_query_fc %d" % _i)
_param_weight = "encoder_layer_%d_multi_head_att_key_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_key_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" %
_i].self_attn.k_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT multi_head_att_key_fc %d" % _i)
_param_weight = "encoder_layer_%d_multi_head_att_value_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_value_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" %
_i].self_attn.v_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT multi_head_att_value_fc %d" % _i)
# init output fc
_param_weight = "encoder_layer_%d_multi_head_att_output_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_output_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" %
_i].self_attn.proj_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT multi_head_att_output_fc %d" % _i)
# init layer_norm 1
_param_weight = "encoder_layer_%d_post_att_layer_norm_scale" % _i
_param_bias = "encoder_layer_%d_post_att_layer_norm_bias" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers[
"layer_%d" % _i].postprocesser1.layer_norm_0.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT layer norm in attention at %d layer" % _i)
# init layer_norm 2
_param_weight = "encoder_layer_%d_post_ffn_layer_norm_scale" % _i
_param_bias = "encoder_layer_%d_post_ffn_layer_norm_bias" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers[
"layer_%d" % _i].postprocesser2.layer_norm_0.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT layer norm in FFN at %d layer" % _i)
# init FFN 1
_param_weight = "encoder_layer_%d_ffn_fc_0.w_0" % _i
_param_bias = "encoder_layer_%d_ffn_fc_0.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" % _i].ffn.fc1.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT FFN-1 at %d layer" % _i)
# init FFN 2
_param_weight = "encoder_layer_%d_ffn_fc_1.w_0" % _i
_param_bias = "encoder_layer_%d_ffn_fc_1.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" % _i].ffn.fc2.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT FFN-2 at %d layer" % _i)
return True
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import six
import sys
if six.PY2:
reload(sys)
sys.setdefaultencoding('utf8')
import ast
import time
import argparse
import numpy as np
import multiprocessing
import collections
import copy
from functools import partial, reduce
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers.utils as utils
from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
from paddle.fluid.dygraph import to_variable, Embedding, Linear, LayerNorm, GRUUnit
from paddle.fluid.data_feeder import convert_dtype
from paddle.fluid import layers
from paddle.fluid.dygraph import Layer
from paddle.fluid.layers import BeamSearchDecoder
__all__ = [
'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode',
'BeamSearchDecoder', 'MultiHeadAttention', 'FFN',
'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer',
'TransformerDecoder', 'TransformerBeamSearchDecoder', 'DynamicGRU',
'BiGRU', 'Linear_chain_crf', 'Crf_decoding', 'SequenceTagging'
]
class RNNCell(Layer):
def get_initial_states(self,
batch_ref,
shape=None,
dtype=None,
init_value=0,
batch_dim_idx=0):
"""
Generate initialized states according to provided shape, data type and
value.
Parameters:
batch_ref: A (possibly nested structure of) tensor variable[s].
The first dimension of the tensor will be used as batch size to
initialize states.
shape: A (possibly nested structure of) shape[s], where a shape is
represented as a list/tuple of integers. -1 (for batch size) will
be automatically inserted if the shape does not start with it. If
None, property `state_shape` will be used. The default value is None.
dtype: A (possibly nested structure of) data type[s]. The structure
must be the same as that of `shape`, except when all tensors in
states have the same data type, in which case a single data type
can be used. If None and property `cell.state_shape` is not
available, float32 will be used as the data type. The default
value is None.
init_value: A float value used to initialize states.
Returns:
Variable: tensor variable[s] packed in the same structure provided \
by shape, representing the initialized states.
"""
# TODO: use inputs and batch_size
batch_ref = flatten(batch_ref)[0]
def _is_shape_sequence(seq):
if sys.version_info < (3, ):
integer_types = (
int,
long, )
else:
integer_types = (int, )
"""For shape, list/tuple of integer is the finest-grained objection"""
if (isinstance(seq, list) or isinstance(seq, tuple)):
if reduce(
lambda flag, x: isinstance(x, integer_types) and flag,
seq, True):
return False
# TODO: add a check for illegal inputs
if isinstance(seq, dict):
return True
return (isinstance(seq, collections.Sequence) and
not isinstance(seq, six.string_types))
class Shape(object):
def __init__(self, shape):
self.shape = shape if shape[0] == -1 else ([-1] + list(shape))
# nested structure of shapes
states_shapes = self.state_shape if shape is None else shape
is_sequence_ori = utils.is_sequence
utils.is_sequence = _is_shape_sequence
states_shapes = map_structure(lambda shape: Shape(shape),
states_shapes)
utils.is_sequence = is_sequence_ori
# nested structure of dtypes
try:
states_dtypes = self.state_dtype if dtype is None else dtype
except NotImplementedError: # use fp32 as default
states_dtypes = "float32"
if len(flatten(states_dtypes)) == 1:
dtype = flatten(states_dtypes)[0]
states_dtypes = map_structure(lambda shape: dtype, states_shapes)
init_states = map_structure(
lambda shape, dtype: fluid.layers.fill_constant_batch_size_like(
input=batch_ref,
shape=shape.shape,
dtype=dtype,
value=init_value,
input_dim_idx=batch_dim_idx), states_shapes, states_dtypes)
return init_states
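# Usage sketch (hypothetical shapes, illustration only): inside a cell's
# forward, cell.get_initial_states(batch_ref=step_input, shape=[hidden_size])
# returns a float32 tensor of shape [batch_size, hidden_size] filled with
# init_value, where batch_size is read from dimension batch_dim_idx of
# batch_ref.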
@property
def state_shape(self):
"""
Abstract method (property).
Used to initialize states.
A (possibly nested structure of) shape[s], where a shape is represented
as a list/tuple of integers (-1 for batch size would be automatically
inserted into a shape if the shape does not start with it).
Not necessary to be implemented if states are not initialized by
`get_initial_states` or the `shape` argument is provided when using
`get_initial_states`.
"""
raise NotImplementedError(
"Please add an implementation for `state_shape` in the used cell.")
@property
def state_dtype(self):
"""
Abstract method (property).
Used to initialize states.
A (possibly nested structure of) data type[s]. The structure must be
the same as that of `shape`, except when all tensors in states have
the same data type, in which case a single data type can be used.
Not necessary to be implemented if states are not initialized
by `get_initial_states` or the `dtype` argument is provided when using
`get_initial_states`.
"""
raise NotImplementedError(
"Please add an implementation for `state_dtype` in the used cell.")
class BasicLSTMCell(RNNCell):
"""
****
BasicLSTMUnit class, Using basic operator to build LSTM
The algorithm can be described as the code below.
.. math::
i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i)
f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget\_bias)
o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o)
\\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
h_t &= o_t \odot tanh(c_t)
- $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix
of weights from the input gate to the input)
- The $b$ terms denote bias vectors ($b_i$ is the input gate bias vector).
- sigmoid is the logistic sigmoid function.
- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
and cell activation vectors, respectively, all of which have the same size as
the cell output activation vector $h$.
- The :math:`\odot` is the element-wise product of the vectors.
- :math:`tanh` is the activation function.
- :math:`\\tilde{c_t}` is also called candidate hidden state,
which is computed based on the current input and the previous hidden state.
Args:
input_size (integer): The input size used in the Unit.
hidden_size (integer): The hidden size used in the Unit.
param_attr(ParamAttr|None): The parameter attribute for the learnable
weight matrix. Note:
If it is set to None or one attribute of ParamAttr, lstm_unit will
create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|None): The parameter attribute for the bias
of LSTM unit.
If it is set to None or one attribute of ParamAttr, lstm_unit will
create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized as zero. Default: None.
gate_activation (function|None): The activation function for gates (actGate).
Default: 'fluid.layers.sigmoid'
activation (function|None): The activation function for cells (actNode).
Default: 'fluid.layers.tanh'
forget_bias(float|1.0): forget bias used when computing forget gate
dtype(string): data type used in this unit
"""
def __init__(self,
input_size,
hidden_size,
param_attr=None,
bias_attr=None,
gate_activation=None,
activation=None,
forget_bias=1.0,
dtype='float32',
forget_gate_weights={"w": None,
"h": None,
"b": None},
input_gate_weights={"w": None,
"h": None,
"b": None},
output_gate_weights={"w": None,
"h": None,
"b": None},
cell_weights={"w": None,
"h": None,
"b": None}):
super(BasicLSTMCell, self).__init__()
self._hidden_size = hidden_size
self._param_attr = param_attr
self._bias_attr = bias_attr
self._gate_activation = gate_activation or layers.sigmoid
self._activation = activation or layers.tanh
self._forget_bias = layers.fill_constant(
[1], dtype=dtype, value=forget_bias)
self._forget_bias.stop_gradient = False
self._dtype = dtype
self._input_size = input_size
assert isinstance(forget_gate_weights, dict)
assert isinstance(input_gate_weights, dict)
assert isinstance(output_gate_weights, dict)
assert isinstance(cell_weights, dict)
# forget gate parameters
if "w" in forget_gate_weights and forget_gate_weights["w"] is not None:
self.fg_w = forget_gate_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_forget_gate_w"
else:
tmp_param_attr = self._param_attr
self.fg_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in forget_gate_weights and forget_gate_weights["h"] is not None:
self.fg_h = forget_gate_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_forget_gate_h"
else:
tmp_param_attr = self._param_attr
self.fg_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in forget_gate_weights and forget_gate_weights["b"] is not None:
self.fg_b = forget_gate_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_forget_gate_b"
else:
tmp_param_attr = self._bias_attr
self.fg_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# input gate parameters
if "w" in input_gate_weights and input_gate_weights["w"] is not None:
self.ig_w = input_gate_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_input_gate_w"
else:
tmp_param_attr = self._param_attr
self.ig_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in input_gate_weights and input_gate_weights["h"] is not None:
self.ig_h = input_gate_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_input_gate_h"
else:
tmp_param_attr = self._param_attr
self.ig_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in input_gate_weights and input_gate_weights["b"] is not None:
self.ig_b = input_gate_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_input_gate_b"
else:
tmp_param_attr = self._bias_attr
self.ig_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# output gate parameters
if "w" in output_gate_weights and output_gate_weights["w"] is not None:
self.og_w = output_gate_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_output_gate_w"
else:
tmp_param_attr = self._param_attr
self.og_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in output_gate_weights and output_gate_weights["h"] is not None:
self.og_h = output_gate_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_output_gate_h"
else:
tmp_param_attr = self._param_attr
self.og_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in output_gate_weights and output_gate_weights["b"] is not None:
self.og_b = output_gate_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_output_gate_b"
else:
tmp_param_attr = self._bias_attr
self.og_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# cell parameters
if "w" in cell_weights and cell_weights["w"] is not None:
self.c_w = cell_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_cell_w"
else:
tmp_param_attr = self._param_attr
self.c_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in cell_weights and cell_weights["h"] is not None:
self.c_h = cell_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_cell_h"
else:
tmp_param_attr = self._param_attr
self.c_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in cell_weights and cell_weights["b"] is not None:
self.c_b = cell_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_cell_b"
else:
tmp_param_attr = self._bias_attr
self.c_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# the weights are concatenated here to make the computation more efficient
weight_w = fluid.layers.concat(
[self.ig_w, self.c_w, self.fg_w, self.og_w], axis=-1)
weight_h = fluid.layers.concat(
[self.ig_h, self.c_h, self.fg_h, self.og_h], axis=-1)
self._weight = fluid.layers.concat([weight_w, weight_h], axis=0)
self._bias = fluid.layers.concat(
[self.ig_b, self.c_b, self.fg_b, self.og_b])
def forward(self, input, state):
pre_hidden, pre_cell = state
concat_input_hidden = layers.concat([input, pre_hidden], 1)
gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
gate_input = layers.elementwise_add(gate_input, self._bias)
i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
new_cell = layers.elementwise_add(
layers.elementwise_mul(
pre_cell,
layers.sigmoid(layers.elementwise_add(f, self._forget_bias))),
layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j)))
new_hidden = layers.tanh(new_cell) * layers.sigmoid(o)
return new_hidden, [new_hidden, new_cell]
@property
def state_shape(self):
return [[self._hidden_size], [self._hidden_size]]
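# A minimal dygraph usage sketch (assumed sizes, illustration only):
#
#   with fluid.dygraph.guard():
#       cell = BasicLSTMCell(input_size=16, hidden_size=32)
#       step_in = to_variable(np.random.rand(4, 16).astype("float32"))
#       init_state = cell.get_initial_states(batch_ref=step_in)
#       out, (h, c) = cell(step_in, init_state)  # out: [4, 32]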
class BasicGRUCell(RNNCell):
"""
****
BasicGRUUnit class, using basic operators to build GRU
The algorithm can be described as the equations below.
.. math::
u_t & = actGate(W_{ux} x_{t} + W_{uh} h_{t-1} + b_u)
r_t & = actGate(W_{rx} x_{t} + W_{rh} h_{t-1} + b_r)
m_t & = actNode(W_{cx} x_{t} + W_{ch} dot(r_t, h_{t-1}) + b_m)
h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)
Args:
hidden_size (integer): The hidden size used in the Unit.
param_attr(ParamAttr|None): The parameter attribute for the learnable
weight matrix. Note:
If it is set to None or one attribute of ParamAttr, gru_unit will
create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|None): The parameter attribute for the bias
of GRU unit.
If it is set to None or one attribute of ParamAttr, gru_unit will
create ParamAttr as bias_attr. If the Initializer of the bias_attr
is not set, the bias is initialized zero. Default: None.
gate_activation (function|None): The activation function for gates (actGate).
Default: 'fluid.layers.sigmoid'
activation (function|None): The activation function for cell (actNode).
Default: 'fluid.layers.tanh'
dtype(string): data type used in this unit
"""
def __init__(self,
input_size,
hidden_size,
param_attr=None,
bias_attr=None,
gate_activation=None,
activation=None,
dtype='float32',
update_gate_weights={"w": None,
"h": None,
"b": None},
reset_gate_weights={"w": None,
"h": None,
"b": None},
cell_weights={"w": None,
"h": None,
"b": None}):
super(BasicGRUCell, self).__init__()
self._input_size = input_size
self._hidden_size = hidden_size
self._param_attr = param_attr
self._bias_attr = bias_attr
self._gate_activation = gate_activation or layers.sigmoid
self._activation = activation or layers.tanh
self._dtype = dtype
assert isinstance(update_gate_weights, dict)
assert isinstance(reset_gate_weights, dict)
assert isinstance(cell_weights, dict)
if self._param_attr is not None and self._param_attr.name is not None:
gate_param_attr = copy.deepcopy(self._param_attr)
candidate_param_attr = copy.deepcopy(self._param_attr)
gate_param_attr.name += "_gate"
candidate_param_attr.name += "_candidate"
else:
gate_param_attr = self._param_attr
candidate_param_attr = self._param_attr
if self._bias_attr is not None and self._bias_attr.name is not None:
gate_bias_attr = copy.deepcopy(self._bias_attr)
candidate_bias_attr = copy.deepcopy(self._bias_attr)
gate_bias_attr.name += "_gate"
candidate_bias_attr.name += "_candidate"
else:
gate_bias_attr = self._bias_attr
candidate_bias_attr = self._bias_attr
# create the parameters of gates in gru
if "w" in update_gate_weights and update_gate_weights["w"] is not None:
self.ug_w = update_gate_weights["w"]
else:
if gate_param_attr is not None and gate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_param_attr)
tmp_param_attr.name += "_update_gate_w"
else:
tmp_param_attr = gate_param_attr
self.ug_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in update_gate_weights and update_gate_weights["h"] is not None:
self.ug_h = update_gate_weights["h"]
else:
if gate_param_attr is not None and gate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_param_attr)
tmp_param_attr.name += "_update_gate_h"
else:
tmp_param_attr = gate_param_attr
self.ug_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in update_gate_weights and update_gate_weights["b"] is not None:
self.ug_b = update_gate_weights["b"]
else:
if gate_bias_attr is not None and gate_bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_bias_attr)
tmp_param_attr.name += "_update_gate_b"
else:
tmp_param_attr = gate_bias_attr
self.ug_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# reset gate parameters
if "w" in reset_gate_weights and reset_gate_weights["w"] is not None:
self.rg_w = reset_gate_weights["w"]
else:
if gate_param_attr is not None and gate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_param_attr)
tmp_param_attr.name += "_reset_gate_w"
else:
tmp_param_attr = gate_param_attr
self.rg_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in reset_gate_weights and reset_gate_weights["h"] is not None:
self.rg_h = reset_gate_weights["h"]
else:
if gate_param_attr is not None and gate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_param_attr)
tmp_param_attr.name += "_reset_gate_h"
else:
tmp_param_attr = gate_param_attr
self.rg_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in reset_gate_weights and reset_gate_weights["b"] is not None:
self.rg_b = reused_params["b"]
else:
if gate_bias_attr is not None and gate_bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_bias_attr)
tmp_param_attr.name += "_reset_gate_b"
else:
tmp_param_attr = gate_bias_attr
self.rg_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# cell parameters
if "w" in cell_weights and cell_weights["w"] is not None:
self.c_w = cell_weights["w"]
else:
if candidate_param_attr is not None and candidate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(candidate_param_attr)
tmp_param_attr.name += "_cell_w"
else:
tmp_param_attr = candidate_param_attr
self.c_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in cell_weights and cell_weights["h"] is not None:
self.c_h = cell_weights["h"]
else:
if candidate_param_attr is not None and candidate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(candidate_param_attr)
tmp_param_attr.name += "_cell_h"
else:
tmp_param_attr = candidate_param_attr
self.c_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in cell_weights and cell_weights["b"] is not None:
self.c_b = cell_weights["b"]
else:
if candidate_bias_attr is not None and candidate_bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(candidate_bias_attr)
tmp_param_attr.name += "_cell_b"
else:
tmp_param_attr = candidate_bias_attr
self.c_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
rg_weights = layers.concat([self.rg_w, self.rg_h], axis=0)
ug_weights = layers.concat([self.ug_w, self.ug_h], axis=0)
self._gate_weight = layers.concat([rg_weights, ug_weights], axis=-1)
self._candidate_weight = layers.concat([self.c_w, self.c_h], axis=0)
self._gate_bias = layers.concat([self.rg_b, self.ug_b], axis=0)
self._candidate_bias = self.c_b
def forward(self, input, state):
pre_hidden = state
concat_input_hidden = layers.concat([input, pre_hidden], axis=1)
gate_input = layers.matmul(x=concat_input_hidden, y=self._gate_weight)
gate_input = layers.elementwise_add(gate_input, self._gate_bias)
gate_input = self._gate_activation(gate_input)
r, u = layers.split(gate_input, num_or_sections=2, dim=1)
r_hidden = r * pre_hidden
candidate = layers.matmul(
layers.concat([input, r_hidden], 1), self._candidate_weight)
candidate = layers.elementwise_add(candidate, self._candidate_bias)
c = self._activation(candidate)
new_hidden = u * pre_hidden + (1 - u) * c
# return (outputs, new_states) so the cell composes with the RNN wrapper
return new_hidden, new_hidden
@property
def state_shape(self):
return [self._hidden_size]
class RNN(fluid.dygraph.Layer):
def __init__(self, cell, is_reverse=False, time_major=False):
super(RNN, self).__init__()
self.cell = cell
if not hasattr(self.cell, "call"):
self.cell.call = self.cell.forward
self.is_reverse = is_reverse
self.time_major = time_major
self.batch_index, self.time_step_index = (1, 0) if time_major else (0,
1)
def forward(self,
inputs,
initial_states=None,
sequence_length=None,
**kwargs):
if fluid.in_dygraph_mode():
class ArrayWrapper(object):
def __init__(self, x):
self.array = [x]
def append(self, x):
self.array.append(x)
return self
def _maybe_copy(state, new_state, step_mask):
# TODO: use where_op
new_state = fluid.layers.elementwise_mul(
new_state, step_mask,
axis=0) - fluid.layers.elementwise_mul(
state, (step_mask - 1), axis=0)
return new_state
flat_inputs = flatten(inputs)
batch_size, time_steps = (
flat_inputs[0].shape[self.batch_index],
flat_inputs[0].shape[self.time_step_index])
if initial_states is None:
initial_states = self.cell.get_initial_states(
batch_ref=inputs, batch_dim_idx=self.batch_index)
if not self.time_major:
inputs = map_structure(
lambda x: fluid.layers.transpose(x, [1, 0] + list(
range(2, len(x.shape)))), inputs)
if sequence_length:
mask = fluid.layers.sequence_mask(
sequence_length,
maxlen=time_steps,
dtype=flatten(initial_states)[0].dtype)
mask = fluid.layers.transpose(mask, [1, 0])
if self.is_reverse:
inputs = map_structure(
lambda x: fluid.layers.reverse(x, axis=[0]), inputs)
mask = fluid.layers.reverse(
mask, axis=[0]) if sequence_length else None
states = initial_states
outputs = []
for i in range(time_steps):
step_inputs = map_structure(lambda x: x[i], inputs)
step_outputs, new_states = self.cell(step_inputs, states,
**kwargs)
if sequence_length:
new_states = map_structure(
partial(
_maybe_copy, step_mask=mask[i]),
states,
new_states)
states = new_states
outputs = map_structure(
lambda x: ArrayWrapper(x),
step_outputs) if i == 0 else map_structure(
lambda x, x_array: x_array.append(x), step_outputs,
outputs)
final_outputs = map_structure(
lambda x: fluid.layers.stack(x.array,
axis=self.time_step_index),
outputs)
if self.is_reverse:
final_outputs = map_structure(
lambda x: fluid.layers.reverse(x,
axis=self.time_step_index),
final_outputs)
final_states = new_states
else:
final_outputs, final_states = fluid.layers.rnn(
self.cell,
inputs,
initial_states=initial_states,
sequence_length=sequence_length,
time_major=self.time_major,
is_reverse=self.is_reverse,
**kwargs)
return final_outputs, final_states
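# A minimal dygraph usage sketch (assumed sizes, illustration only):
#
#   with fluid.dygraph.guard():
#       cell = BasicGRUCell(input_size=16, hidden_size=32)
#       rnn = RNN(cell)  # batch-major by default (time_major=False)
#       seq = to_variable(np.random.rand(4, 10, 16).astype("float32"))
#       outputs, final_state = rnn(seq)  # outputs: [4, 10, 32]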
class DynamicDecode(Layer):
def __init__(self,
decoder,
max_step_num=None,
output_time_major=False,
impute_finished=False,
is_test=False,
return_length=False):
super(DynamicDecode, self).__init__()
self.decoder = decoder
self.max_step_num = max_step_num
self.output_time_major = output_time_major
self.impute_finished = impute_finished
self.is_test = is_test
self.return_length = return_length
def forward(self, inits=None, **kwargs):
if fluid.in_dygraph_mode():
class ArrayWrapper(object):
def __init__(self, x):
self.array = [x]
def append(self, x):
self.array.append(x)
return self
def __getitem__(self, item):
return self.array.__getitem__(item)
def _maybe_copy(state, new_state, step_mask):
# TODO: use where_op
state_dtype = state.dtype
if convert_dtype(state_dtype) in ["bool"]:
state = layers.cast(state, dtype="float32")
new_state = layers.cast(new_state, dtype="float32")
if step_mask.dtype != state.dtype:
step_mask = layers.cast(step_mask, dtype=state.dtype)
# otherwise, the renamed bool gradients would be summed up, leading
# to a sum(bool) error
step_mask.stop_gradient = True
new_state = layers.elementwise_mul(
state, step_mask, axis=0) - layers.elementwise_mul(
new_state, (step_mask - 1), axis=0)
if convert_dtype(state_dtype) in ["bool"]:
new_state = layers.cast(new_state, dtype=state_dtype)
return new_state
initial_inputs, initial_states, initial_finished = self.decoder.initialize(
inits)
inputs, states, finished = (initial_inputs, initial_states,
initial_finished)
cond = layers.logical_not((layers.reduce_all(initial_finished)))
sequence_lengths = layers.cast(
layers.zeros_like(initial_finished), "int64")
outputs = None
step_idx = 0
step_idx_tensor = layers.fill_constant(
shape=[1], dtype="int64", value=step_idx)
while cond.numpy():
(step_outputs, next_states, next_inputs,
next_finished) = self.decoder.step(step_idx_tensor, inputs,
states, **kwargs)
if not self.decoder.tracks_own_finished:
# BeamSearchDecoder tracks its own finished status, since beams
# would be reordered and the finished status of each entry might
# change. Otherwise, perform a logical OR, which does not change
# entries that are already finished.
next_finished = layers.logical_or(next_finished, finished)
# keep states.finished/finished consistent with next_finished
layers.assign(next_finished, finished)
next_sequence_lengths = layers.elementwise_add(
sequence_lengths,
layers.cast(
layers.logical_not(finished), sequence_lengths.dtype))
if self.impute_finished: # rectify the states for the finished.
next_states = map_structure(
lambda x, y: _maybe_copy(x, y, finished), states,
next_states)
outputs = map_structure(
lambda x: ArrayWrapper(x),
step_outputs) if step_idx == 0 else map_structure(
lambda x, x_array: x_array.append(x), step_outputs,
outputs)
inputs, states, finished, sequence_lengths = (
next_inputs, next_states, next_finished,
next_sequence_lengths)
layers.increment(x=step_idx_tensor, value=1.0, in_place=True)
step_idx += 1
layers.logical_not(layers.reduce_all(finished), cond)
if self.max_step_num is not None and step_idx > self.max_step_num:
break
final_outputs = map_structure(
lambda x: fluid.layers.stack(x.array, axis=0), outputs)
final_states = states
try:
final_outputs, final_states = self.decoder.finalize(
final_outputs, final_states, sequence_lengths)
except NotImplementedError:
pass
if not self.output_time_major:
final_outputs = map_structure(
lambda x: layers.transpose(x, [1, 0] + list(
range(2, len(x.shape)))), final_outputs)
return (final_outputs, final_states,
sequence_lengths) if self.return_length else (
final_outputs, final_states)
else:
return fluid.layers.dynamic_decode(
self.decoder,
inits,
max_step_num=self.max_step_num,
output_time_major=self.output_time_major,
impute_finished=self.impute_finished,
is_test=self.is_test,
return_length=self.return_length,
**kwargs)
class TransfomerCell(object):
"""
Let inputs=(trg_word, trg_pos), states=cache to make Transformer can be
used as RNNCell
"""
def __init__(self, decoder):
self.decoder = decoder
def __call__(self, inputs, states, trg_src_attn_bias, enc_output,
static_caches):
trg_word, trg_pos = inputs
for cache, static_cache in zip(states, static_caches):
cache.update(static_cache)
logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
enc_output, states)
new_states = [{"k": cache["k"], "v": cache["v"]} for cache in states]
return logits, new_states
class TransformerBeamSearchDecoder(layers.BeamSearchDecoder):
def __init__(self, cell, start_token, end_token, beam_size,
var_dim_in_state):
super(TransformerBeamSearchDecoder,
self).__init__(cell, start_token, end_token, beam_size)
self.cell = cell
self.var_dim_in_state = var_dim_in_state
def _merge_batch_beams_with_var_dim(self, x):
# the cache length starts at 0 and grows as decoding proceeds, so
# the reshape has to be handled carefully
var_dim_in_state = self.var_dim_in_state + 1 # count in beam dim
x = layers.transpose(x,
list(range(var_dim_in_state, len(x.shape))) +
list(range(0, var_dim_in_state)))
x = layers.reshape(
x, [0] * (len(x.shape) - var_dim_in_state
) + [self.batch_size * self.beam_size] +
[int(size) for size in x.shape[-var_dim_in_state + 2:]])
x = layers.transpose(
x,
list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) +
list(range(0, (len(x.shape) + 1 - var_dim_in_state))))
return x
def _split_batch_beams_with_var_dim(self, x):
var_dim_size = layers.shape(x)[self.var_dim_in_state]
x = layers.reshape(
x, [-1, self.beam_size] +
[int(size)
for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] +
[int(size) for size in x.shape[self.var_dim_in_state + 1:]])
return x
def step(self, time, inputs, states, **kwargs):
# compared to RNN, Transformer has 3D data at every decoding step
inputs = layers.reshape(inputs, [-1, 1]) # token
pos = layers.ones_like(inputs) * time # pos
cell_states = map_structure(self._merge_batch_beams_with_var_dim,
states.cell_states)
cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states,
**kwargs)
cell_outputs = map_structure(self._split_batch_beams, cell_outputs)
next_cell_states = map_structure(self._split_batch_beams_with_var_dim,
next_cell_states)
beam_search_output, beam_search_state = self._beam_search_step(
time=time,
logits=cell_outputs,
next_cell_states=next_cell_states,
beam_state=states)
next_inputs, finished = (beam_search_output.predicted_ids,
beam_search_state.finished)
return (beam_search_output, beam_search_state, next_inputs, finished)
### Transformer Modules ###
class PrePostProcessLayer(Layer):
"""
PrePostProcessLayer
"""
def __init__(self,
process_cmd,
d_model,
dropout_rate,
reused_layer_norm=None):
super(PrePostProcessLayer, self).__init__()
self.process_cmd = process_cmd
self.functors = []
for cmd in self.process_cmd:
if cmd == "a": # add residual connection
self.functors.append(lambda x, y: x + y if y is not None else x)
elif cmd == "n": # add layer normalization
if reused_layer_norm is not None:
layer_norm = reused_layer_norm
else:
layer_norm = LayerNorm(
normalized_shape=d_model,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(0.)))
self.functors.append(
self.add_sublayer(
"layer_norm_%d" % len(
self.sublayers(include_sublayers=False)),
layer_norm))
elif cmd == "d": # add dropout
self.functors.append(lambda x: layers.dropout(
x, dropout_prob=dropout_rate, is_test=False)
if dropout_rate else x)
def forward(self, x, residual=None):
for i, cmd in enumerate(self.process_cmd):
if cmd == "a":
x = self.functors[i](x, residual)
else:
x = self.functors[i](x)
return x
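# For example (reading off the functors above): process_cmd="n" applies only
# layer normalization (the usual pre-process), while "da" applies dropout and
# then adds the residual (the usual post-process), so an encoder sublayer
# computes postprocess(sublayer(preprocess(x)), x).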
class MultiHeadAttention(Layer):
"""
Multi-Head Attention
"""
def __init__(self,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.0,
reused_query_fc=None,
reused_key_fc=None,
reused_value_fc=None,
reused_proj_fc=None):
super(MultiHeadAttention, self).__init__()
self.n_head = n_head
self.d_key = d_key
self.d_value = d_value
self.d_model = d_model
self.dropout_rate = dropout_rate
if reused_query_fc is not None:
self.q_fc = reused_query_fc
else:
self.q_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
if reused_key_fc is not None:
self.k_fc = reused_key_fc
else:
self.k_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
if reused_value_fc is not None:
self.v_fc = reused_value_fc
else:
self.v_fc = Linear(
input_dim=d_model,
output_dim=d_value * n_head,
bias_attr=False)
if reused_proj_fc is not None:
self.proj_fc = reused_proj_fc
else:
self.proj_fc = Linear(
input_dim=d_value * n_head,
output_dim=d_model,
bias_attr=False)
def _prepare_qkv(self, queries, keys, values, cache=None):
if keys is None: # self-attention
keys, values = queries, queries
static_kv = False
else: # cross-attention
static_kv = True
q = self.q_fc(queries)
q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key])
q = layers.transpose(x=q, perm=[0, 2, 1, 3])
if cache is not None and static_kv and "static_k" in cache:
# for encoder-decoder attention in inference when k, v are already cached
k = cache["static_k"]
v = cache["static_v"]
else:
k = self.k_fc(keys)
v = self.v_fc(values)
k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
k = layers.transpose(x=k, perm=[0, 2, 1, 3])
v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
v = layers.transpose(x=v, perm=[0, 2, 1, 3])
if cache is not None:
if static_kv and not "static_k" in cache:
# for encoder-decoder attention in inference and has not cached
cache["static_k"], cache["static_v"] = k, v
elif not static_kv:
# for decoder self-attention in inference
cache_k, cache_v = cache["k"], cache["v"]
k = layers.concat([cache_k, k], axis=2)
v = layers.concat([cache_v, v], axis=2)
cache["k"], cache["v"] = k, v
return q, k, v
    def forward(self, queries, keys, values, attn_bias, cache=None):
        # compute q, k, v
        q, k, v = self._prepare_qkv(queries, keys, values, cache)
        # scaled dot-product attention
        product = layers.matmul(
            x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5)
        if attn_bias is not None:
            product += attn_bias
weights = layers.softmax(product)
if self.dropout_rate:
weights = layers.dropout(
weights, dropout_prob=self.dropout_rate, is_test=False)
out = layers.matmul(weights, v)
# combine heads
out = layers.transpose(out, perm=[0, 2, 1, 3])
out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
# project to output
out = self.proj_fc(out)
return out
def cal_kv(self, keys, values):
k = self.k_fc(keys)
v = self.v_fc(values)
k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
k = layers.transpose(x=k, perm=[0, 2, 1, 3])
v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
v = layers.transpose(x=v, perm=[0, 2, 1, 3])
return k, v
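# A minimal self-attention sketch (hypothetical sizes): passing keys=None
# makes the layer attend over `queries` itself; 2 heads of width 4 are
# concatenated and projected back to the model width of 8.
def _demo_multi_head_attention():
    import numpy as np
    with fluid.dygraph.guard():
        attn = MultiHeadAttention(d_key=4, d_value=4, d_model=8, n_head=2)
        q = fluid.dygraph.to_variable(
            np.random.rand(2, 5, 8).astype("float32"))
        out = attn(q, None, None, attn_bias=None)  # self-attention
        print(out.shape)  # [2, 5, 8]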
class FFN(Layer):
"""
Feed-Forward Network
"""
def __init__(self,
d_inner_hid,
d_model,
dropout_rate,
fc1_act="relu",
reused_fc1=None,
reused_fc2=None):
super(FFN, self).__init__()
self.dropout_rate = dropout_rate
if reused_fc1 is not None:
self.fc1 = reused_fc1
else:
self.fc1 = Linear(
input_dim=d_model, output_dim=d_inner_hid, act=fc1_act)
if reused_fc2 is not None:
self.fc2 = reused_fc2
else:
self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model)
def forward(self, x):
hidden = self.fc1(x)
if self.dropout_rate:
hidden = layers.dropout(
hidden, dropout_prob=self.dropout_rate, is_test=False)
out = self.fc2(hidden)
return out
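# A quick shape sketch (hypothetical sizes): the position-wise FFN expands
# the model width to d_inner_hid with an activation and projects it back,
# so input and output shapes match.
def _demo_ffn():
    import numpy as np
    with fluid.dygraph.guard():
        ffn = FFN(d_inner_hid=32, d_model=8, dropout_rate=0.1)
        x = fluid.dygraph.to_variable(
            np.random.rand(2, 4, 8).astype("float32"))
        print(ffn(x).shape)  # [2, 4, 8]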
class TransformerEncoderLayer(Layer):
"""
EncoderLayer
"""
def __init__(self,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da",
ffn_fc1_act="relu",
reused_pre_selatt_layernorm=None,
reused_multihead_att_weights={
"reused_query_fc": None,
"reused_key_fc": None,
"reused_value_fc": None,
"reused_proj_fc": None
},
reused_post_selfatt_layernorm=None,
reused_pre_ffn_layernorm=None,
reused_ffn_weights={"reused_fc1": None,
"reused_fc2": None},
reused_post_ffn_layernorm=None):
super(TransformerEncoderLayer, self).__init__()
self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout,
reused_pre_selatt_layernorm)
self.self_attn = MultiHeadAttention(
d_key,
d_value,
d_model,
n_head,
attention_dropout,
reused_query_fc=reused_multihead_att_weights["reused_query_fc"],
reused_key_fc=reused_multihead_att_weights["reused_key_fc"],
reused_value_fc=reused_multihead_att_weights["reused_value_fc"],
reused_proj_fc=reused_multihead_att_weights["reused_proj_fc"])
self.postprocesser1 = PrePostProcessLayer(
postprocess_cmd, d_model, prepostprocess_dropout,
reused_post_selfatt_layernorm)
self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout,
reused_pre_ffn_layernorm)
self.ffn = FFN(d_inner_hid,
d_model,
relu_dropout,
fc1_act=ffn_fc1_act,
reused_fc1=reused_ffn_weights["reused_fc1"],
reused_fc2=reused_ffn_weights["reused_fc2"])
self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout,
reused_post_ffn_layernorm)
def forward(self, enc_input, attn_bias):
attn_output = self.self_attn(
self.preprocesser1(enc_input), None, None, attn_bias)
attn_output = self.postprocesser1(attn_output, enc_input)
ffn_output = self.ffn(self.preprocesser2(attn_output))
ffn_output = self.postprocesser2(ffn_output, attn_output)
return ffn_output
class TransformerEncoder(Layer):
"""
encoder
"""
def __init__(self,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da",
ffn_fc1_act="relu"):
super(TransformerEncoder, self).__init__()
self.encoder_layers = list()
for i in range(n_layer):
self.encoder_layers.append(
self.add_sublayer(
"layer_%d" % i,
TransformerEncoderLayer(
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
ffn_fc1_act=ffn_fc1_act)))
self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
def forward(self, enc_input, attn_bias):
for encoder_layer in self.encoder_layers:
enc_output = encoder_layer(enc_input, attn_bias)
enc_input = enc_output
return self.processer(enc_output)
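# A minimal end-to-end encoder sketch (hypothetical sizes): two layers over
# a width-8 model; attn_bias=None disables masking, which is fine for this
# toy input.
def _demo_transformer_encoder():
    import numpy as np
    with fluid.dygraph.guard():
        enc = TransformerEncoder(
            n_layer=2, n_head=2, d_key=4, d_value=4, d_model=8,
            d_inner_hid=32, prepostprocess_dropout=0.1,
            attention_dropout=0.1, relu_dropout=0.1)
        src = fluid.dygraph.to_variable(
            np.random.rand(2, 5, 8).astype("float32"))
        print(enc(src, None).shape)  # [2, 5, 8]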
class TransformerDecoderLayer(Layer):
"""
    Decoder layer
"""
def __init__(self,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd="n",
postprocess_cmd="da",
reused_pre_selfatt_layernorm=None,
reused_self_multihead_att_weights={
"reused_query_fc": None,
"reused_key_fc": None,
"reused_value_fc": None,
"reused_proj_fc": None
},
reused_post_selfatt_layernorm=None,
reused_pre_crossatt_layernorm=None,
reused_cross_multihead_att_weights={
"reused_query_fc": None,
"reused_key_fc": None,
"reused_value_fc": None,
"reused_proj_fc": None
},
reused_post_crossatt_layernorm=None,
reused_pre_ffn_layernorm=None,
reused_ffn_weights={"reused_fc1": None,
"reused_fc2": None},
reused_post_ffn_layernorm=None):
super(TransformerDecoderLayer, self).__init__()
self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout,
reused_pre_selfatt_layernorm)
self.self_attn = MultiHeadAttention(
d_key,
d_value,
d_model,
n_head,
attention_dropout,
reused_query_fc=reused_self_multihead_att_weights[
"reused_query_fc"],
reused_key_fc=reused_self_multihead_att_weights["reused_key_fc"],
reused_value_fc=reused_self_multihead_att_weights[
"reused_value_fc"],
reused_proj_fc=reused_self_multihead_att_weights["reused_proj_fc"])
self.postprocesser1 = PrePostProcessLayer(
postprocess_cmd, d_model, prepostprocess_dropout,
reused_post_selfatt_layernorm)
self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout,
reused_pre_crossatt_layernorm)
self.cross_attn = MultiHeadAttention(
d_key,
d_value,
d_model,
n_head,
attention_dropout,
reused_query_fc=reused_cross_multihead_att_weights[
"reused_query_fc"],
reused_key_fc=reused_cross_multihead_att_weights["reused_key_fc"],
reused_value_fc=reused_cross_multihead_att_weights[
"reused_value_fc"],
reused_proj_fc=reused_cross_multihead_att_weights[
"reused_proj_fc"])
self.postprocesser2 = PrePostProcessLayer(
postprocess_cmd, d_model, prepostprocess_dropout,
reused_post_crossatt_layernorm)
self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout,
reused_pre_ffn_layernorm)
self.ffn = FFN(d_inner_hid,
d_model,
relu_dropout,
reused_fc1=reused_ffn_weights["reused_fc1"],
reused_fc2=reused_ffn_weights["reused_fc2"])
self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout,
reused_post_ffn_layernorm)
def forward(self,
dec_input,
enc_output,
self_attn_bias,
cross_attn_bias,
cache=None):
self_attn_output = self.self_attn(
self.preprocesser1(dec_input), None, None, self_attn_bias, cache)
self_attn_output = self.postprocesser1(self_attn_output, dec_input)
cross_attn_output = self.cross_attn(
self.preprocesser2(self_attn_output), enc_output, enc_output,
cross_attn_bias, cache)
cross_attn_output = self.postprocesser2(cross_attn_output,
self_attn_output)
ffn_output = self.ffn(self.preprocesser3(cross_attn_output))
ffn_output = self.postprocesser3(ffn_output, cross_attn_output)
return ffn_output
class TransformerDecoder(Layer):
"""
decoder
"""
def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
prepostprocess_dropout, attention_dropout, relu_dropout,
preprocess_cmd, postprocess_cmd):
super(TransformerDecoder, self).__init__()
self.decoder_layers = list()
for i in range(n_layer):
self.decoder_layers.append(
self.add_sublayer(
"layer_%d" % i,
TransformerDecoderLayer(
n_head, d_key, d_value, d_model, d_inner_hid,
prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd)))
self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout)
def forward(self,
dec_input,
enc_output,
self_attn_bias,
cross_attn_bias,
caches=None):
for i, decoder_layer in enumerate(self.decoder_layers):
dec_output = decoder_layer(dec_input, enc_output, self_attn_bias,
cross_attn_bias, None
if caches is None else caches[i])
dec_input = dec_output
return self.processer(dec_output)
def prepare_static_cache(self, enc_output):
return [
dict(
zip(("static_k", "static_v"),
decoder_layer.cross_attn.cal_kv(enc_output, enc_output)))
for decoder_layer in self.decoder_layers
]
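# A minimal decoder sketch (hypothetical sizes): a training-style call with
# no cache, plus a look at the per-layer static key/value dicts that
# prepare_static_cache precomputes for incremental inference.
def _demo_transformer_decoder():
    import numpy as np
    with fluid.dygraph.guard():
        dec = TransformerDecoder(
            n_layer=2, n_head=2, d_key=4, d_value=4, d_model=8,
            d_inner_hid=32, prepostprocess_dropout=0.0,
            attention_dropout=0.0, relu_dropout=0.0,
            preprocess_cmd="n", postprocess_cmd="da")
        enc_out = fluid.dygraph.to_variable(
            np.random.rand(2, 6, 8).astype("float32"))
        tgt = fluid.dygraph.to_variable(
            np.random.rand(2, 3, 8).astype("float32"))
        print(dec(tgt, enc_out, None, None).shape)  # [2, 3, 8]
        static_kv = dec.prepare_static_cache(enc_out)
        print(sorted(static_kv[0].keys()))  # ['static_k', 'static_v']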
class DynamicGRU(fluid.dygraph.Layer):
def __init__(self,
size,
h_0=None,
param_attr=None,
bias_attr=None,
is_reverse=False,
gate_activation='sigmoid',
candidate_activation='tanh',
origin_mode=False,
init_size=None):
super(DynamicGRU, self).__init__()
self.gru_unit = GRUUnit(
size * 3,
param_attr=param_attr,
bias_attr=bias_attr,
activation=candidate_activation,
gate_activation=gate_activation,
origin_mode=origin_mode)
self.size = size
self.h_0 = h_0
self.is_reverse = is_reverse
def forward(self, inputs):
hidden = self.h_0
res = []
for i in range(inputs.shape[1]):
if self.is_reverse:
i = inputs.shape[1] - 1 - i
input_ = inputs[:, i:i + 1, :]
input_ = fluid.layers.reshape(
input_, [-1, input_.shape[2]], inplace=False)
hidden, reset, gate = self.gru_unit(input_, hidden)
hidden_ = fluid.layers.reshape(
hidden, [-1, 1, hidden.shape[1]], inplace=False)
res.append(hidden_)
if self.is_reverse:
res = res[::-1]
res = fluid.layers.concat(res, axis=1)
return res
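# A minimal sketch (hypothetical sizes): DynamicGRU consumes pre-projected
# gate inputs of width 3 * size (as BiGRU below produces with its Linear
# layers) and unrolls the GRUUnit step by step over the time axis.
def _demo_dynamic_gru():
    import numpy as np
    with fluid.dygraph.guard():
        batch, seq_len, hidden = 2, 4, 8
        h0 = fluid.dygraph.to_variable(
            np.zeros((batch, hidden), dtype="float32"))
        gru = DynamicGRU(size=hidden, h_0=h0)
        x = fluid.dygraph.to_variable(
            np.random.rand(batch, seq_len, hidden * 3).astype("float32"))
        print(gru(x).shape)  # [2, 4, 8]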
class BiGRU(fluid.dygraph.Layer):
def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
super(BiGRU, self).__init__()
self.pre_gru = Linear(
input_dim=input_dim,
output_dim=grnn_hidden_dim * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.gru = DynamicGRU(
size=grnn_hidden_dim,
h_0=h_0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.pre_gru_r = Linear(
input_dim=input_dim,
output_dim=grnn_hidden_dim * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.gru_r = DynamicGRU(
size=grnn_hidden_dim,
is_reverse=True,
h_0=h_0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
def forward(self, input_feature):
res_pre_gru = self.pre_gru(input_feature)
res_gru = self.gru(res_pre_gru)
res_pre_gru_r = self.pre_gru_r(input_feature)
res_gru_r = self.gru_r(res_pre_gru_r)
bi_merge = fluid.layers.concat(input=[res_gru, res_gru_r], axis=-1)
return bi_merge
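# A minimal sketch (hypothetical sizes): BiGRU concatenates a forward and a
# reverse DynamicGRU pass, so the output width is 2 * grnn_hidden_dim.
def _demo_bigru():
    import numpy as np
    with fluid.dygraph.guard():
        batch, hidden = 2, 8
        h0 = fluid.dygraph.to_variable(
            np.zeros((batch, hidden), dtype="float32"))
        bigru = BiGRU(input_dim=16, grnn_hidden_dim=hidden,
                      init_bound=0.1, h_0=h0)
        x = fluid.dygraph.to_variable(
            np.random.rand(batch, 5, 16).astype("float32"))
        print(bigru(x).shape)  # [2, 5, 16], i.e. 2 * hidden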
class Linear_chain_crf(fluid.dygraph.Layer):
def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
super(Linear_chain_crf, self).__init__()
self._param_attr = param_attr
self._dtype = dtype
self._size = size
self._is_test = is_test
self._transition = self.create_parameter(
attr=self._param_attr,
shape=[self._size + 2, self._size],
dtype=self._dtype)
@property
def weight(self):
return self._transition
@weight.setter
def weight(self, value):
self._transition = value
def forward(self, input, label, length=None):
alpha = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
emission_exps = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
transition_exps = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
log_likelihood = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
this_inputs = {
"Emission": [input],
"Transition": self._transition,
"Label": [label]
}
        if length is not None:
this_inputs['Length'] = [length]
self._helper.append_op(
type='linear_chain_crf',
inputs=this_inputs,
outputs={
"Alpha": [alpha],
"EmissionExps": [emission_exps],
"TransitionExps": transition_exps,
"LogLikelihood": log_likelihood
},
attrs={"is_test": self._is_test, })
return log_likelihood
class Crf_decoding(fluid.dygraph.Layer):
def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
super(Crf_decoding, self).__init__()
self._dtype = dtype
self._size = size
self._is_test = is_test
self._param_attr = param_attr
self._transition = self.create_parameter(
attr=self._param_attr,
shape=[self._size + 2, self._size],
dtype=self._dtype)
@property
def weight(self):
return self._transition
@weight.setter
def weight(self, value):
self._transition = value
def forward(self, input, label=None, length=None):
viterbi_path = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
this_inputs = {
"Emission": [input],
"Transition": self._transition,
"Label": label
}
        if length is not None:
this_inputs['Length'] = [length]
self._helper.append_op(
type='crf_decoding',
inputs=this_inputs,
outputs={"ViterbiPath": [viterbi_path]},
attrs={"is_test": self._is_test, })
return viterbi_path
class SequenceTagging(fluid.dygraph.Layer):
def __init__(self,
vocab_size,
num_labels,
batch_size,
word_emb_dim=128,
grnn_hidden_dim=128,
emb_learning_rate=0.1,
crf_learning_rate=0.1,
bigru_num=2,
init_bound=0.1,
length=None):
        """
        Define the sequence tagging network structure: a word-embedding
        layer feeds `bigru_num` stacked BiGRU blocks, whose output is
        projected to per-label emission scores; a linear-chain CRF layer
        computes the training loss and a CRF decoding layer produces the
        Viterbi prediction.
        """
        super(SequenceTagging, self).__init__()
self.word_emb_dim = word_emb_dim
self.vocab_size = vocab_size
self.num_labels = num_labels
self.grnn_hidden_dim = grnn_hidden_dim
self.emb_lr = emb_learning_rate
self.crf_lr = crf_learning_rate
self.bigru_num = bigru_num
self.batch_size = batch_size
        self.init_bound = init_bound
self.word_embedding = Embedding(
size=[self.vocab_size, self.word_emb_dim],
dtype='float32',
param_attr=fluid.ParamAttr(
learning_rate=self.emb_lr,
name="word_emb",
initializer=fluid.initializer.Uniform(
low=-self.init_bound, high=self.init_bound)))
h_0 = fluid.layers.create_global_var(
shape=[self.batch_size, self.grnn_hidden_dim],
value=0.0,
dtype='float32',
persistable=True,
force_cpu=True,
name='h_0')
self.bigru_units = []
for i in range(self.bigru_num):
if i == 0:
self.bigru_units.append(
self.add_sublayer(
"bigru_units%d" % i,
BiGRU(
self.grnn_hidden_dim,
self.grnn_hidden_dim,
self.init_bound,
h_0=h_0)))
else:
self.bigru_units.append(
self.add_sublayer(
"bigru_units%d" % i,
BiGRU(
self.grnn_hidden_dim * 2,
self.grnn_hidden_dim,
self.init_bound,
h_0=h_0)))
self.fc = Linear(
input_dim=self.grnn_hidden_dim * 2,
output_dim=self.num_labels,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-self.init_bound, high=self.init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.linear_chain_crf = Linear_chain_crf(
param_attr=fluid.ParamAttr(
name='linear_chain_crfw', learning_rate=self.crf_lr),
size=self.num_labels)
self.crf_decoding = Crf_decoding(
param_attr=fluid.ParamAttr(
name='crfw', learning_rate=self.crf_lr),
size=self.num_labels)
def forward(self, word, target, lengths):
"""
Configure the network
"""
word_embed = self.word_embedding(word)
input_feature = word_embed
for i in range(self.bigru_num):
bigru_output = self.bigru_units[i](input_feature)
input_feature = bigru_output
emission = self.fc(bigru_output)
crf_cost = self.linear_chain_crf(
input=emission, label=target, length=lengths)
avg_cost = fluid.layers.mean(x=crf_cost)
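        # Tie the decoding transitions to the transitions learned by the
        # CRF loss so Viterbi decoding uses the trained parameters.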
self.crf_decoding.weight = self.linear_chain_crf.weight
crf_decode = self.crf_decoding(input=emission, length=lengths)
return crf_decode, avg_cost, lengths
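# A construction-only sketch (hypothetical sizes); a real forward pass
# additionally needs word ids, gold labels, and sequence lengths as int64
# tensors.
def _demo_sequence_tagging():
    with fluid.dygraph.guard():
        model = SequenceTagging(vocab_size=1000, num_labels=10, batch_size=4)
        print(len(model.bigru_units))  # 2 stacked BiGRU blocks by default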
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import unicodedata
import six
import io
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with io.open(vocab_file, encoding="utf8") as fin:
        for num, line in enumerate(fin):
            items = convert_to_unicode(line.strip()).split("\t")
            if len(items) > 2:
                break
            token = items[0]
            index = items[1] if len(items) == 2 else num
            token = token.strip()
            vocab[token] = int(index)
    return vocab
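# A hedged sketch of the file format load_vocab accepts: either one token
# per line (the line number becomes the id) or "token<TAB>id" pairs. The
# temp-file path here is purely illustrative.
def _demo_load_vocab():
    import os
    import tempfile
    path = os.path.join(tempfile.mkdtemp(), "vocab.txt")
    with io.open(path, "w", encoding="utf8") as f:
        f.write(u"[PAD]\n[UNK]\nhello\nworld\n")
    vocab = load_vocab(path)
    print(vocab["hello"])  # 2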
def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output
def convert_tokens_to_ids(vocab, tokens):
return convert_by_vocab(vocab, tokens)
def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
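# A minimal end-to-end sketch with a tiny hand-made vocabulary (the file
# contents and path are illustrative): basic tokenization feeds greedy
# WordPiece matching, reproducing the classic "unaffable" example.
def _demo_full_tokenizer():
    import os
    import tempfile
    path = os.path.join(tempfile.mkdtemp(), "vocab.txt")
    with io.open(path, "w", encoding="utf8") as f:
        f.write(u"[UNK]\nun\n##aff\n##able\n")
    tokenizer = FullTokenizer(path)
    print(tokenizer.tokenize("unaffable"))  # ['un', '##aff', '##able']
    print(tokenizer.convert_tokens_to_ids(["un", "##able"]))  # [1, 3]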
class CharTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in text.lower().split(" "):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
        # characters in the vocabulary because Wikipedia does have some Chinese
        # words in the English Wikipedia).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenziation."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
            text: A single token or whitespace-separated tokens. This should have
                already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
[metadata]
name = hapi
author = zhouxiangyang
author_email = zhouxiangyang@baidu.com
version = 0.0.1
description = HAPI
long_description = file: README.md
long_description_content_type = text/markdown
home_page = https://github.com/PaddlePaddle/hapi
license = Apache 2.0
classifier =
Private :: Do Not Upload
Programming Language :: Python
Programming Language :: Python :: 2
Programming Language :: Python :: 2.7
Programming Language :: Python :: 3
Programming Language :: Python :: 3.5
Programming Language :: Python :: 3.6
Programming Language :: Python :: 3.7
keywords =
paddlepaddle
paddle
high-level-api
[options]
packages = find:
#install_requires =
# paddlepaddle-gpu >= 1.5.2
include_package_data = True
zip_safe = False
[sdist]
dist_dir = output/dist
[bdist_wheel]
dist_dir = output/dist
[easy_install]
index_url = http://pip.baidu.com/root/baidu/+simple/
# -*- coding: UTF-8 -*-
################################################################################
#
# Copyright (c) 2019 Baidu.com, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
"""
Setup script.
Authors: zhouxiangyang(zhouxiangyang@baidu.com)
Date: 2020/2/4 00:00:01
"""
import setuptools
with open("README.md", "r") as fh:
long_description = fh.read()
setuptools.setup(
name="hapi",
version="0.0.1",
author="PaddlePaddle",
author_email="zhouxiangyang@baidu.com",
description="A Paddle High-level API that supports both static and dynamic execution modes (still under development)",
# long_description=long_description,
# long_description_content_type="text/markdown",
url="https://github.com/PaddlePaddle/hapi",
# packages=setuptools.find_packages(),
packages=[
'hapi', 'hapi.text', 'hapi.text.tokenizer', 'hapi.text.bert',
'hapi.text.bert.utils'
],
package_dir={
'hapi': './hapi',
'hapi.text': './hapi/text',
'hapi.text.tokenizer': './hapi/text/tokenizer',
'hapi.text.bert': './hapi/text/bert',
'hapi.text.bert.utils': './hapi/text/bert/utils',
},
platforms="any",
license='Apache 2.0',
classifiers=[
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
], )
import paddle
from hapi.model import set_device
from hapi.text.bert.dataloader import SingleSentenceDataLoader
import hapi.text.tokenizer.tokenization as tokenization
device = set_device("cpu")
paddle.fluid.enable_dygraph(device)
tokenizer = tokenization.FullTokenizer(
vocab_file="./tmp/hapi/data/pretrained_models/uncased_L-12_H-768_A-12/vocab.txt",
do_lower_case=True)
bert_dataloader = SingleSentenceDataLoader(
"./tmp/hapi/aaa.txt",
tokenizer, ["1", "2"],
max_seq_length=32,
batch_size=1)
for data in bert_dataloader.dataloader():
print(data)