add bert, refine text.py

601db3ab · xyzhou-puck · d90f7cc6 · 601db3ab · 601db3ab · 601db3ab
29 changed file
--- a/bert/bert_classifier.py
+++ b/bert/bert_classifier.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT fine-tuning in Paddle Dygraph Mode."""
+import os
+import io
+import time
+import argparse
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import to_variable
+from hapi.text.text import PrePostProcessLayer
+from hapi.text.bert.bert import BertConfig
+from cls import ClsModelLayer
+from hapi.text.bert.optimization import Optimizer
+from hapi.text.bert.utils.args import ArgumentGroup, print_arguments, check_cuda
+from hapi.model import set_device, Model, SoftmaxWithCrossEntropy, Input
+from hapi.metrics import Accuracy
+from hapi.text.bert.dataloader import SingleSentenceDataLoader, BertInputExample
+import hapi.text.tokenizer.tokenization as tokenization
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
+model_g.add_arg("bert_config_path",      str,  "./config/bert_config.json",  "Path to the json file for bert model config.")
+model_g.add_arg("init_checkpoint",       str,  None,                         "Init checkpoint to resume training from.")
+model_g.add_arg("init_pretraining_params",  str,  None,
+                "Init pre-training params which preforms fine-tuning from. If the "
+                "arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
+model_g.add_arg("checkpoints",           str,  "checkpoints",                "Path to save checkpoints.")
+train_g = ArgumentGroup(parser, "training", "training options.")
+train_g.add_arg("epoch",             int,    100,     "Number of epoches for training.")
+train_g.add_arg("learning_rate",     float,  0.0001,  "Learning rate used to train with warmup.")
+train_g.add_arg("lr_scheduler",      str,    "linear_warmup_decay",
+                "scheduler of learning rate.", choices=['linear_warmup_decay', 'noam_decay'])
+train_g.add_arg("weight_decay",      float,  0.01,    "Weight decay rate for L2 regularizer.")
+train_g.add_arg("warmup_proportion",     float,  0.1,                         "Proportion of training steps to perform linear learning rate warmup for.")
+train_g.add_arg("save_steps",        int,    10000,   "The steps interval to save checkpoints.")
+train_g.add_arg("validation_steps",  int,    1000,    "The steps interval to evaluate model performance.")
+train_g.add_arg("loss_scaling",      float,  1.0,
+                "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
+log_g = ArgumentGroup(parser,     "logging", "logging related.")
+log_g.add_arg("skip_steps",          int,    10,    "The steps interval to print loss.")
+data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
+data_g.add_arg("data_dir",            str,  None,       "Path to training data.")
+data_g.add_arg("vocab_path",          str,  None,       "Vocabulary path.")
+data_g.add_arg("max_seq_len",         int,  512,                   "Tokens' number of the longest seqence allowed.")
+data_g.add_arg("batch_size",          int,  32,
+               "The total number of examples in one batch for training, see also --in_tokens.")
+data_g.add_arg("in_tokens",           bool, False,
+               "If set, the batch size will be the maximum number of tokens in one batch. "
+               "Otherwise, it will be the maximum number of examples in one batch.")
+data_g.add_arg("do_lower_case", bool, True,
+               "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
+data_g.add_arg("random_seed",   int,  5512,     "Random seed.")
+run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
+run_type_g.add_arg("use_cuda",                     bool,   True,   "If set, use GPU for training.")
+run_type_g.add_arg("shuffle",                      bool,   True,  "")
+run_type_g.add_arg("task_name",                    str,    None,
+                   "The name of task to perform fine-tuning, should be in {'xnli', 'mnli', 'cola', 'mrpc'}.")
+run_type_g.add_arg("do_train",                     bool,   True,  "Whether to perform training.")
+run_type_g.add_arg("do_test",                      bool,   False,  "Whether to perform evaluation on test data set.")
+run_type_g.add_arg("use_data_parallel", bool, False,  "The flag indicating whether to shuffle instances in each pass.")
+run_type_g.add_arg("enable_ce", bool, False,  help="The flag indicating whether to run the task for continuous evaluation.")
+args = parser.parse_args()
+def create_data(batch):
+    """
+    convert data to variable
+    """
+    src_ids = to_variable(batch[0], "src_ids")
+    position_ids = to_variable(batch[1], "position_ids")
+    sentence_ids = to_variable(batch[2], "sentence_ids")
+    input_mask = to_variable(batch[3], "input_mask")
+    labels = to_variable(batch[4], "labels")
+    labels.stop_gradient = True
+    return src_ids, position_ids, sentence_ids, input_mask, labels
+def train(args):
+    device = set_device("gpu" if args.use_cuda else "cpu")
+    fluid.enable_dygraph(device)
+    bert_config = BertConfig(args.bert_config_path)
+    bert_config.print_config()
+    if not (args.do_train or args.do_test):
+        raise ValueError("For args `do_train`, `do_test`, at "
+                        "least one of them must be True.")
+    trainer_count = fluid.dygraph.parallel.Env().nranks
+    tokenizer = tokenization.FullTokenizer(
+        vocab_file=args.vocab_path, do_lower_case=args.do_lower_case)
+    def mnli_line_processor(line_id, line):
+        if line_id == "0":
+            return None
+        uid = tokenization.convert_to_unicode(line[0])
+        text_a = tokenization.convert_to_unicode(line[8])
+        text_b = tokenization.convert_to_unicode(line[9])
+        label = tokenization.convert_to_unicode(line[-1])
+        if label not in ["contradiction", "entailment", "neutral"]:
+            label = "contradiction"
+        return BertInputExample(uid=uid, text_a=text_a, text_b=text_b, label=label)
+    bert_dataloader = SingleSentenceDataLoader("./data/glue_data/MNLI/train.tsv", tokenizer, ["contradiction", "entailment", "neutral"],
+        max_seq_length=64, batch_size=32, line_processor=mnli_line_processor)
+    num_train_examples = len(bert_dataloader.dataset)
+    max_train_steps = args.epoch * num_train_examples // args.batch_size // trainer_count
+    warmup_steps = int(max_train_steps * args.warmup_proportion)
+    print("Trainer count: %d" % trainer_count)
+    print("Num train examples: %d" % num_train_examples)
+    print("Max train steps: %d" % max_train_steps)
+    print("Num warmup steps: %d" % warmup_steps)
+    if args.use_data_parallel:
+        strategy = fluid.dygraph.parallel.prepare_context()
+    inputs = [Input([None, None], 'int64', name='src_ids'),
+              Input([None, None], 'int64', name='pos_ids'),
+              Input([None, None], 'int64', name='sent_ids'),
+              Input([None, None], 'float32', name='input_mask')]
+    labels = [Input([None, 1], 'int64', name='label')]
+    cls_model = ClsModelLayer(
+                            args,
+                            bert_config,
+                            3,
+                            is_training=True,
+                            return_pooled_out=True)
+    optimizer = Optimizer(
+                    warmup_steps=warmup_steps,
+                    num_train_steps=max_train_steps,
+                    learning_rate=args.learning_rate,
+                    model_cls=cls_model,
+                    weight_decay=args.weight_decay,
+                    scheduler=args.lr_scheduler,
+                    loss_scaling=args.loss_scaling,
+                    parameter_list=cls_model.parameters())
+    cls_model.prepare(
+        optimizer,
+        SoftmaxWithCrossEntropy(),
+        Accuracy(topk=(1, 2)),
+        inputs,
+        labels,
+        device=device)
+    cls_model.bert_layer.init_parameters(args.init_pretraining_params, verbose=True)
+    cls_model.fit(train_data=bert_dataloader.dataloader, epochs=args.epoch)
+    return cls_model
+if __name__ == '__main__':
+    print_arguments(args)
+    check_cuda(args.use_cuda)
+    if args.do_train:
+        cls_model = train(args)
--- a/bert/cls.py
+++ b/bert/cls.py
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"dygraph transformer layers"
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import six
+import json
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Linear, Layer
+from hapi.text.bert.bert import BertEncoder
+from hapi.model import Model
+class ClsModelLayer(Model):
+    """
+    classify model
+    """
+    def __init__(self,
+                 args,
+                 config,
+                 num_labels,
+                 is_training=True,
+                 return_pooled_out=True,
+                 use_fp16=False):
+        super(ClsModelLayer, self).__init__()
+        self.config = config
+        self.is_training = is_training
+        self.use_fp16 = use_fp16
+        self.loss_scaling = args.loss_scaling
+        self.bert_layer = BertEncoder(
+            config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
+        self.cls_fc = Linear(
+            input_dim=self.config["hidden_size"],
+            output_dim=num_labels,
+            param_attr=fluid.ParamAttr(
+                name="cls_out_w",
+                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+            bias_attr=fluid.ParamAttr(
+                name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
+    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
+        """
+        forward
+        """
+        #src_ids = data_ids[0]
+        #position_ids = data_ids[1]
+        #sentence_ids = data_ids[2]
+        #input_mask = data_ids[3]
+        #labels = data_ids[4]
+        enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
+                                                     sentence_ids, input_mask)
+        cls_feats = fluid.layers.dropout(
+            x=next_sent_feat,
+            dropout_prob=0.1,
+            dropout_implementation="upscale_in_train")
+        logits = self.cls_fc(cls_feats)
+        return logits
+        """
+        logits = self.cls_fc(cls_feats)
+        ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
+            logits=logits, label=labels, return_softmax=True)
+        loss = fluid.layers.mean(x=ce_loss)
+        if self.use_fp16 and self.loss_scaling > 1.0:
+            loss *= self.loss_scaling
+        num_seqs = fluid.layers.create_tensor(dtype='int64')
+        accuracy = fluid.layers.accuracy(
+            input=probs, label=labels, total=num_seqs)
+        """
+        return loss, accuracy
--- a/bert/run_classifier_single_gpu.sh
+++ b/bert/run_classifier_single_gpu.sh
+#!/bin/bash
+BERT_BASE_PATH="./data/pretrained_models/uncased_L-12_H-768_A-12/"
+TASK_NAME='MNLI'
+DATA_PATH="./data/glue_data/MNLI/"
+CKPT_PATH="./data/saved_model/mnli_models"
+export CUDA_VISIBLE_DEVICES=0
+# start fine-tuning
+python3.7 bert_classifier.py\
+    --task_name ${TASK_NAME} \
+    --use_cuda true \
+    --do_train true \
+    --do_test true \
+    --batch_size 64 \
+    --init_pretraining_params ${BERT_BASE_PATH}/dygraph_params/ \
+    --data_dir ${DATA_PATH} \
+    --vocab_path ${BERT_BASE_PATH}/vocab.txt \
+    --checkpoints ${CKPT_PATH} \
+    --save_steps 1000 \
+    --weight_decay  0.01 \
+    --warmup_proportion 0.1 \
+    --validation_steps 100 \
+    --epoch 3 \
+    --max_seq_len 128 \
+    --bert_config_path ${BERT_BASE_PATH}/bert_config.json \
+    --learning_rate 5e-5 \
+    --skip_steps 10 \
+    --shuffle true
--- a/hapi/__init__.py
+++ b/hapi/__init__.py
--- a/hapi/callbacks.py
+++ b/hapi/callbacks.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import six
+import copy
+from hapi.progressbar import ProgressBar
+from paddle.fluid.dygraph.parallel import ParallelEnv
+def config_callbacks(callbacks=None,
+                     model=None,
+                     batch_size=None,
+                     epochs=None,
+                     steps=None,
+                     log_freq=2,
+                     verbose=2,
+                     save_freq=1,
+                     save_dir=None,
+                     metrics=None,
+                     mode='train'):
+    cbks = callbacks or []
+    cbks = cbks if isinstance(cbks, (list, tuple)) else [cbks]
+    if not any(isinstance(k, ProgBarLogger) for k in cbks) and verbose:
+        cbks = cbks + [ProgBarLogger(log_freq, verbose=verbose)]
+    if not any(isinstance(k, ModelCheckpoint) for k in cbks):
+        cbks = cbks + [ModelCheckpoint(save_freq, save_dir)]
+    cbk_list = CallbackList(cbks)
+    cbk_list.set_model(model)
+    metrics = metrics or [] if mode != 'test' else []
+    params = {
+        'batch_size': batch_size,
+        'epochs': epochs,
+        'steps': steps,
+        'verbose': verbose,
+        'metrics': metrics,
+    }
+    cbk_list.set_params(params)
+    return cbk_list
+class CallbackList(object):
+    def __init__(self, callbacks=None):
+        # copy
+        self.callbacks = [c for c in callbacks]
+        self.params = {}
+        self.model = None
+    def append(self, callback):
+        self.callbacks.append(callback)
+    def __iter__(self):
+        return iter(self.callbacks)
+    def set_params(self, params):
+        for c in self.callbacks:
+            c.set_params(params)
+    def set_model(self, model):
+        for c in self.callbacks:
+            c.set_model(model)
+    def _call(self, name, *args):
+        for c in self.callbacks:
+            func = getattr(c, name)
+            func(*args)
+    def _check_mode(self, mode):
+        assert mode in ['train', 'eval', 'test'], \
+            'mode should be train, eval or test'
+    def on_begin(self, mode, logs=None):
+        self._check_mode(mode)
+        name = 'on_{}_begin'.format(mode)
+        self._call(name, logs)
+    def on_end(self, mode, logs=None):
+        self._check_mode(mode)
+        name = 'on_{}_end'.format(mode)
+        self._call(name, logs)
+    def on_epoch_begin(self, epoch=None, logs=None):
+        self._call('on_epoch_begin', epoch, logs)
+    def on_epoch_end(self, epoch=None, logs=None):
+        self._call('on_epoch_end', epoch, logs)
+    def on_batch_begin(self, mode, step=None, logs=None):
+        self._check_mode(mode)
+        name = 'on_{}_batch_begin'.format(mode)
+        self._call(name, step, logs)
+    def on_batch_end(self, mode, step=None, logs=None):
+        self._check_mode(mode)
+        name = 'on_{}_batch_end'.format(mode)
+        self._call(name, step, logs)
+class Callback(object):
+    def __init__(self):
+        self.model = None
+        self.params = {}
+    def set_params(self, params):
+        self.params = params
+    def set_model(self, model):
+        self.model = model
+    def on_train_begin(self, logs=None):
+        """
+        """
+    def on_train_end(self, logs=None):
+        """
+        """
+    def on_eval_begin(self, logs=None):
+        """
+        """
+    def on_eval_end(self, logs=None):
+        """
+        """
+    def on_test_begin(self, logs=None):
+        """
+        """
+    def on_test_end(self, logs=None):
+        """
+        """
+    def on_epoch_begin(self, epoch, logs=None):
+        """
+        """
+    def on_epoch_end(self, epoch, logs=None):
+        """
+        """
+    def on_train_batch_begin(self, step, logs=None):
+        """
+        """
+    def on_train_batch_end(self, step, logs=None):
+        """
+        """
+    def on_eval_batch_begin(self, step, logs=None):
+        """
+        """
+    def on_eval_batch_end(self, step, logs=None):
+        """
+        """
+    def on_eval_batch_begin(self, step, logs=None):
+        """
+        """
+    def on_eval_batch_end(self, step, logs=None):
+        """
+        """
+class ProgBarLogger(Callback):
+    def __init__(self, log_freq=1, verbose=2):
+        self.epochs = None
+        self.steps = None
+        self.progbar = None
+        self.verbose = verbose
+        self.log_freq = log_freq
+    def on_train_begin(self, logs=None):
+        self.epochs = self.params['epochs']
+        assert self.epochs
+        self.train_metrics = self.params['metrics']
+        assert self.train_metrics
+    def on_epoch_begin(self, epoch=None, logs=None):
+        self.steps = self.params['steps']
+        self.epoch = epoch
+        self.train_step = 0
+        if self.verbose and self.epochs and ParallelEnv().local_rank == 0:
+            print('Epoch %d/%d' % (epoch + 1, self.epochs))
+        self.train_progbar = ProgressBar(num=self.steps, verbose=self.verbose)
+    def _updates(self, logs, mode):
+        values = []
+        metrics = getattr(self, '%s_metrics' % (mode))
+        progbar = getattr(self, '%s_progbar' % (mode))
+        steps = getattr(self, '%s_step' % (mode))
+        for k in metrics:
+            if k in logs:
+                values.append((k, logs[k]))
+        progbar.update(steps, values)
+    def on_train_batch_end(self, step, logs=None):
+        logs = logs or {}
+        self.train_step += 1
+        if self.train_step % self.log_freq == 0 and self.verbose and ParallelEnv(
+        ).local_rank == 0:
+            # if steps is not None, last step will update in on_epoch_end
+            if self.steps and self.train_step < self.steps:
+                self._updates(logs, 'train')
+            else:
+                self._updates(logs, 'train')
+    def on_epoch_end(self, epoch, logs=None):
+        logs = logs or {}
+        if self.verbose and ParallelEnv().local_rank == 0:
+            self._updates(logs, 'train')
+    def on_eval_begin(self, logs=None):
+        self.eval_steps = logs.get('steps', None)
+        self.eval_metrics = logs.get('metrics_name', [])
+        self.eval_step = 0
+        self.evaled_samples = 0
+        self.eval_progbar = ProgressBar(
+            num=self.eval_steps, verbose=self.verbose)
+        if ParallelEnv().local_rank == 0:
+            print('Eval begin...')
+    def on_eval_batch_end(self, step, logs=None):
+        logs = logs or {}
+        self.eval_step = step
+        samples = logs.get('batch_size', 1)
+        self.evaled_samples += samples
+        if self.eval_step % self.log_freq == 0 and self.verbose and ParallelEnv(
+        ).local_rank == 0:
+            # if steps is not None, last step will update in on_epoch_end
+            if self.eval_steps and self.eval_step < self.eval_steps:
+                self._updates(logs, 'eval')
+    def on_eval_end(self, logs=None):
+        logs = logs or {}
+        if self.verbose and ParallelEnv().local_rank == 0:
+            self._updates(logs, 'eval')
+            print('Eval samples: %d' % (self.evaled_samples))
+class ModelCheckpoint(Callback):
+    def __init__(self, save_freq=1, save_dir=None):
+        self.save_freq = save_freq
+        self.save_dir = save_dir
+    def on_epoch_begin(self, epoch=None, logs=None):
+        self.epoch = epoch
+    def _is_save(self):
+        return self.model and self.save_dir and ParallelEnv().local_rank == 0
+    def on_epoch_end(self, epoch, logs=None):
+        if self._is_save() and self.epoch % self.save_freq == 0:
+            path = '{}/{}'.format(self.save_dir, epoch)
+            print('save checkpoint at {}'.format(path))
+            self.model.save(path)
+    def on_train_end(self, logs=None):
+        if self._is_save():
+            path = '{}/final'.format(self.save_dir)
+            print('save checkpoint at {}'.format(path))
+            self.model.save(path)
--- a/hapi/distributed.py
+++ b/hapi/distributed.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+import six
+import time
+import math
+import socket
+import contextlib
+import numpy as np
+from paddle import fluid
+from paddle.fluid.layers import collective
+from paddle.fluid.dygraph.parallel import ParallelEnv, ParallelStrategy
+from paddle.fluid.io import BatchSampler
+_parallel_context_initialized = False
+class DistributedBatchSampler(BatchSampler):
+    """Sampler that restricts data loading to a subset of the dataset.
+    In such case, each process can pass a DistributedBatchSampler instance 
+    as a DataLoader sampler, and load a subset of the original dataset that 
+    is exclusive to it.
+    .. note::
+        Dataset is assumed to be of constant size.
+    Args:
+        data_source: this could be a `fluid.io.Dataset` implement
+                     or other python object which implemented
+                     `__len__` for BatchSampler to get sample
+                     number of data source.
+        batch_size(int): sample indice number in a mini-batch indices.
+        shuffle(bool): whther to shuffle indices order before genrating
+            batch indices. Default False.
+        drop_last(bool): whether drop the last incomplete batch dataset size
+            is not divisible by the batch size. Default False
+    """
+    def __init__(self, dataset, batch_size, shuffle=False, drop_last=False):
+        self.dataset = dataset
+        assert isinstance(batch_size, int) and batch_size > 0, \
+                "batch_size should be a positive integer"
+        self.batch_size = batch_size
+        assert isinstance(shuffle, bool), \
+                "shuffle should be a boolean value"
+        self.shuffle = shuffle
+        assert isinstance(drop_last, bool), \
+                "drop_last should be a boolean number"
+        self.drop_last = drop_last
+        self.nranks = ParallelEnv().nranks
+        self.local_rank = ParallelEnv().local_rank
+        self.epoch = 0
+        self.num_samples = int(
+            math.ceil(len(self.dataset) * 1.0 / self.nranks))
+        self.total_size = self.num_samples * self.nranks
+    def __iter__(self):
+        num_samples = len(self.dataset)
+        indices = np.arange(num_samples).tolist()
+        indices += indices[:(self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+        if self.shuffle:
+            np.random.RandomState(self.epoch).shuffle(indices)
+            self.epoch += 1
+        # subsample
+        def _get_indices_by_batch_size(indices):
+            subsampled_indices = []
+            last_batch_size = self.total_size % (self.batch_size * self.nranks)
+            assert last_batch_size % self.nranks == 0
+            last_local_batch_size = last_batch_size // self.nranks
+            for i in range(self.local_rank * self.batch_size,
+                           len(indices) - last_batch_size,
+                           self.batch_size * self.nranks):
+                subsampled_indices.extend(indices[i:i + self.batch_size])
+            indices = indices[len(indices) - last_batch_size:]
+            subsampled_indices.extend(indices[
+                self.local_rank * last_local_batch_size:(
+                    self.local_rank + 1) * last_local_batch_size])
+            return subsampled_indices
+        if self.nranks > 1:
+            indices = _get_indices_by_batch_size(indices)
+        assert len(indices) == self.num_samples
+        _sample_iter = iter(indices)
+        batch_indices = []
+        for idx in _sample_iter:
+            batch_indices.append(idx)
+            if len(batch_indices) == self.batch_size:
+                yield batch_indices
+                batch_indices = []
+        if not self.drop_last and len(batch_indices) > 0:
+            yield batch_indices
+    def __len__(self):
+        num_samples = self.num_samples
+        num_samples += int(not self.drop_last) * (self.batch_size - 1)
+        return num_samples // self.batch_size
+    def set_epoch(self, epoch):
+        self.epoch = epoch
+def _all_gather(x, nranks, ring_id=0, use_calc_stream=True):
+    return collective._c_allgather(
+        x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream)
+def wait_server_ready(endpoints):
+    assert not isinstance(endpoints, six.string_types)
+    while True:
+        all_ok = True
+        not_ready_endpoints = []
+        for ep in endpoints:
+            ip_port = ep.split(":")
+            with contextlib.closing(
+                    socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+                sock.settimeout(2)
+                result = sock.connect_ex((ip_port[0], int(ip_port[1])))
+                if result != 0:
+                    all_ok = False
+                    not_ready_endpoints.append(ep)
+        if not all_ok:
+            time.sleep(3)
+        else:
+            break
+def init_communicator(program, rank, nranks, wait_port, current_endpoint,
+                      endpoints):
+    if nranks < 2:
+        return
+    other_endpoints = endpoints[:]
+    other_endpoints.remove(current_endpoint)
+    if rank == 0 and wait_port:
+        wait_server_ready(other_endpoints)
+    block = program.global_block()
+    nccl_id_var = block.create_var(
+        name=fluid.unique_name.generate('nccl_id'),
+        persistable=True,
+        type=fluid.core.VarDesc.VarType.RAW)
+    block.append_op(
+        type='c_gen_nccl_id',
+        inputs={},
+        outputs={'Out': nccl_id_var},
+        attrs={
+            'rank': rank,
+            'endpoint': current_endpoint,
+            'other_endpoints': other_endpoints
+        })
+    block.append_op(
+        type='c_comm_init',
+        inputs={'X': nccl_id_var},
+        outputs={},
+        attrs={
+            'nranks': nranks,
+            'rank': rank,
+            'ring_id': 0,
+        })
+def prepare_distributed_context(place=None):
+    if place is None:
+        place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \
+            else fluid.CUDAPlace(0)
+    strategy = ParallelStrategy()
+    strategy.nranks = ParallelEnv().nranks
+    strategy.local_rank = ParallelEnv().local_rank
+    strategy.trainer_endpoints = ParallelEnv().trainer_endpoints
+    strategy.current_endpoint = ParallelEnv().current_endpoint
+    if strategy.nranks < 2:
+        return
+    global _parallel_context_initialized
+    if not _parallel_context_initialized and isinstance(place,
+                                                        fluid.CUDAPlace):
+        def _init_context():
+            communicator_prog = fluid.Program()
+            init_communicator(communicator_prog, strategy.local_rank,
+                              strategy.nranks, True, strategy.current_endpoint,
+                              strategy.trainer_endpoints)
+            exe = fluid.Executor(place)
+            exe.run(communicator_prog)
+        if fluid.in_dygraph_mode():
+            fluid.disable_dygraph()
+            _init_context()
+            fluid.enable_dygraph(place)
+        else:
+            _init_context()
+    else:
+        assert ("Only support CUDAPlace for now.")
+    _parallel_context_initialized = True
+    return strategy
--- a/hapi/metrics.py
+++ b/hapi/metrics.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+import six
+import abc
+import numpy as np
+import paddle.fluid as fluid
+import logging
+FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
+logging.basicConfig(level=logging.INFO, format=FORMAT)
+logger = logging.getLogger(__name__)
+__all__ = ['Metric', 'Accuracy']
+@six.add_metaclass(abc.ABCMeta)
+class Metric(object):
+    """
+    Base class for metric, encapsulates metric logic and APIs
+    Usage:
+    m = SomeMetric()
+    for prediction, label in ...:
+        m.update(prediction, label)
+    m.accumulate()
+    """
+    @abc.abstractmethod
+    def reset(self):
+        """
+        Reset states and result
+        """
+        raise NotImplementedError("function 'reset' not implemented in {}.".
+                                  format(self.__class__.__name__))
+    @abc.abstractmethod
+    def update(self, *args, **kwargs):
+        """
+        Update states for metric
+        """
+        raise NotImplementedError("function 'update' not implemented in {}.".
+                                  format(self.__class__.__name__))
+    @abc.abstractmethod
+    def accumulate(self):
+        """
+        Accumulates statistics, computes and returns the metric value
+        """
+        raise NotImplementedError(
+            "function 'accumulate' not implemented in {}.".format(
+                self.__class__.__name__))
+    @abc.abstractmethod
+    def name(self):
+        """
+        Returns metric name
+        """
+        raise NotImplementedError("function 'name' not implemented in {}.".
+                                  format(self.__class__.__name__))
+    def add_metric_op(self, pred, label):
+        """
+        Add process op for metric in program
+        """
+        return pred, label
+class Accuracy(Metric):
+    """
+    Encapsulates accuracy metric logic
+    """
+    def __init__(self, topk=(1, ), name=None, *args, **kwargs):
+        super(Accuracy, self).__init__(*args, **kwargs)
+        self.topk = topk
+        self.maxk = max(topk)
+        self._init_name(name)
+        self.reset()
+    def add_metric_op(self, pred, label, *args, **kwargs):
+        pred = fluid.layers.argsort(pred[0], descending=True)[1][:, :self.maxk]
+        correct = pred == label[0]
+        return correct
+    def update(self, correct, *args, **kwargs):
+        accs = []
+        for i, k in enumerate(self.topk):
+            num_corrects = correct[:, :k].sum()
+            num_samples = len(correct)
+            accs.append(float(num_corrects) / num_samples)
+            self.total[i] += num_corrects
+            self.count[i] += num_samples
+        return accs
+    def reset(self):
+        self.total = [0.] * len(self.topk)
+        self.count = [0] * len(self.topk)
+    def accumulate(self):
+        res = []
+        for t, c in zip(self.total, self.count):
+            res.append(float(t) / c)
+        return res
+    def _init_name(self, name):
+        name = name or 'acc'
+        if self.maxk != 1:
+            self._name = ['{}_top{}'.format(name, k) for k in self.topk]
+        else:
+            self._name = ['acc']
+    def name(self):
+        return self._name
--- a/hapi/model.py
+++ b/hapi/model.py
--- a/hapi/progressbar.py
+++ b/hapi/progressbar.py
+import sys
+import time
+import numpy as np
+class ProgressBar(object):
+    """progress bar """
+    def __init__(self,
+                 num=None,
+                 width=30,
+                 verbose=1,
+                 start=True,
+                 file=sys.stdout):
+        self._num = num
+        if isinstance(num, int) and num <= 0:
+            raise TypeError('num should be None or integer (> 0)')
+        max_width = self._get_max_width()
+        self._width = width if width <= max_width else max_width
+        self._total_width = 0
+        self._verbose = verbose
+        self.file = file
+        self._values = {}
+        self._values_order = []
+        if start:
+            self._start = time.time()
+        self._last_update = 0
+        self._dynamic_display = (
+            (hasattr(self.file, 'isatty') and
+             self.file.isatty()) or 'ipykernel' in sys.modules or
+            'posix' in sys.modules or 'PYCHARM_HOSTED' in os.environ)
+    def _get_max_width(self):
+        if sys.version_info > (3, 3):
+            from shutil import get_terminal_size
+        else:
+            from backports.shutil_get_terminal_size import get_terminal_size
+        terminal_width, _ = get_terminal_size()
+        max_width = min(int(terminal_width * 0.6), terminal_width - 50)
+        return max_width
+    def start(self):
+        self.file.flush()
+        self._start = time.time()
+    def update(self, current_num, values=None):
+        now = time.time()
+        if current_num:
+            time_per_unit = (now - self._start) / current_num
+        else:
+            time_per_unit = 0
+        if time_per_unit >= 1 or time_per_unit == 0:
+            fps = ' - %.0fs/%s' % (time_per_unit, 'step')
+        elif time_per_unit >= 1e-3:
+            fps = ' - %.0fms/%s' % (time_per_unit * 1e3, 'step')
+        else:
+            fps = ' - %.0fus/%s' % (time_per_unit * 1e6, 'step')
+        info = ''
+        if self._verbose == 1:
+            prev_total_width = self._total_width
+            if self._dynamic_display:
+                sys.stdout.write('\b' * prev_total_width)
+                sys.stdout.write('\r')
+            else:
+                sys.stdout.write('\n')
+            if self._num is not None:
+                numdigits = int(np.log10(self._num)) + 1
+                bar_chars = ('step %' + str(numdigits) + 'd/%d [') % (
+                    current_num, self._num)
+                prog = float(current_num) / self._num
+                prog_width = int(self._width * prog)
+                if prog_width > 0:
+                    bar_chars += ('=' * (prog_width - 1))
+                    if current_num < self._num:
+                        bar_chars += '>'
+                    else:
+                        bar_chars += '='
+                bar_chars += ('.' * (self._width - prog_width))
+                bar_chars += ']'
+            else:
+                bar_chars = 'step %3d' % current_num
+            self._total_width = len(bar_chars)
+            sys.stdout.write(bar_chars)
+            for k, val in values:
+                info += ' - %s:' % k
+                val = val if isinstance(val, list) else [val]
+                for i, v in enumerate(val):
+                    if isinstance(v, (float, np.float32, np.float64)):
+                        if abs(v) > 1e-3:
+                            info += ' %.4f' % v
+                        else:
+                            info += ' %.4e' % v
+                    else:
+                        info += ' %s' % v
+            if self._num is not None and current_num < self._num:
+                eta = time_per_unit * (self._num - current_num)
+                if eta > 3600:
+                    eta_format = '%d:%02d:%02d' % (eta // 3600, (eta % 3600) //
+                                                   60, eta % 60)
+                elif eta > 60:
+                    eta_format = '%d:%02d' % (eta // 60, eta % 60)
+                else:
+                    eta_format = '%ds' % eta
+                info += ' - ETA: %s' % eta_format
+            info += fps
+            self._total_width += len(info)
+            if prev_total_width > self._total_width:
+                info += (' ' * (prev_total_width - self._total_width))
+            # newline for another epoch
+            if self._num is not None and current_num >= self._num:
+                info += '\n'
+            if self._num is None:
+                info += '\n'
+            sys.stdout.write(info)
+            sys.stdout.flush()
+            self._last_update = now
+        elif self._verbose == 2:
+            if self._num:
+                numdigits = int(np.log10(self._num)) + 1
+                count = ('step %' + str(numdigits) + 'd/%d') % (current_num,
+                                                                self._num)
+            else:
+                count = 'step %3d' % current_num
+            info = count + info
+            for k, val in values:
+                info += ' - %s:' % k
+                val = val if isinstance(val, list) else [val]
+                for v in val:
+                    if isinstance(v, (float, np.float32, np.float64)):
+                        if abs(v) > 1e-3:
+                            info += ' %.4f' % v
+                        else:
+                            info += ' %.4e' % v
+                    elif isinstance(v, np.ndarray) and \
+                        v.size == 1 and \
+                        isinstance(v.dtype, (np.float32, np.float64)):
+                        if abs(v[0]) > 1e-3:
+                            info += ' %.4f' % v[0]
+                        else:
+                            info += ' %.4e' % v[0]
+                    else:
+                        info += ' %s' % v
+            info += fps
+            info += '\n'
+            sys.stdout.write(info)
+            sys.stdout.flush()
--- a/hapi/text/__init__.py
+++ b/hapi/text/__init__.py
--- a/hapi/text/bert/__init__.py
+++ b/hapi/text/bert/__init__.py
--- a/hapi/text/bert/batching.py
+++ b/hapi/text/bert/batching.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Mask, padding and batching."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
+    """
+    Add mask for batch_tokens, return out, mask_label, mask_pos;
+    Note: mask_pos responding the batch_tokens after padded;
+    """
+    max_len = max([len(sent) for sent in batch_tokens])
+    mask_label = []
+    mask_pos = []
+    prob_mask = np.random.rand(total_token_num)
+    # Note: the first token is [CLS], so [low=1]
+    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
+    pre_sent_len = 0
+    prob_index = 0
+    for sent_index, sent in enumerate(batch_tokens):
+        mask_flag = False
+        prob_index += pre_sent_len
+        for token_index, token in enumerate(sent):
+            prob = prob_mask[prob_index + token_index]
+            if prob > 0.15:
+                continue
+            elif 0.03 < prob <= 0.15:
+                # mask
+                if token != SEP and token != CLS:
+                    mask_label.append(sent[token_index])
+                    sent[token_index] = MASK
+                    mask_flag = True
+                    mask_pos.append(sent_index * max_len + token_index)
+            elif 0.015 < prob <= 0.03:
+                # random replace
+                if token != SEP and token != CLS:
+                    mask_label.append(sent[token_index])
+                    sent[token_index] = replace_ids[prob_index + token_index]
+                    mask_flag = True
+                    mask_pos.append(sent_index * max_len + token_index)
+            else:
+                # keep the original token
+                if token != SEP and token != CLS:
+                    mask_label.append(sent[token_index])
+                    mask_pos.append(sent_index * max_len + token_index)
+        pre_sent_len = len(sent)
+        # ensure at least mask one word in a sentence
+        while not mask_flag:
+            token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
+            if sent[token_index] != SEP and sent[token_index] != CLS:
+                mask_label.append(sent[token_index])
+                sent[token_index] = MASK
+                mask_flag = True
+                mask_pos.append(sent_index * max_len + token_index)
+    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
+    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
+    return batch_tokens, mask_label, mask_pos
+def prepare_batch_data(insts,
+                       total_token_num,
+                       voc_size=0,
+                       pad_id=None,
+                       cls_id=None,
+                       sep_id=None,
+                       mask_id=None,
+                       return_input_mask=True,
+                       return_max_len=True,
+                       return_num_token=False):
+    """
+    1. generate Tensor of data
+    2. generate Tensor of position
+    3. generate self attention mask, [shape: batch_size *  max_len * max_len]
+    """
+    batch_src_ids = [inst[0] for inst in insts]
+    batch_pos_ids = [inst[1] for inst in insts]
+    batch_sent_ids = [inst[2] for inst in insts]
+    labels_list = []
+    # compatible with squad, whose example includes start/end positions, 
+    # or unique id
+    for i in range(3, len(insts[0]), 1):
+        labels = [inst[i] for inst in insts]
+        labels = np.array(labels).astype("int64").reshape([-1, 1])
+        labels_list.append(labels)
+    # First step: do mask without padding
+    if mask_id >= 0:
+        out, mask_label, mask_pos = mask(
+            batch_src_ids,
+            total_token_num,
+            vocab_size=voc_size,
+            CLS=cls_id,
+            SEP=sep_id,
+            MASK=mask_id)
+    else:
+        out = batch_src_ids
+    # Second step: padding
+    src_id, self_input_mask = pad_batch_data(
+        out, pad_idx=pad_id, return_input_mask=True)
+    pos_id = pad_batch_data(
+        batch_pos_ids,
+        pad_idx=pad_id,
+        return_pos=False,
+        return_input_mask=False)
+    sent_id = pad_batch_data(
+        batch_sent_ids,
+        pad_idx=pad_id,
+        return_pos=False,
+        return_input_mask=False)
+    if mask_id >= 0:
+        return_list = [
+            src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos
+        ] + labels_list
+    else:
+        return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
+    return return_list if len(return_list) > 1 else return_list[0]
+def pad_batch_data(insts,
+                   pad_idx=0,
+                   return_pos=False,
+                   return_input_mask=False,
+                   return_max_len=False,
+                   return_num_token=False):
+    """
+    Pad the instances to the max sequence length in batch, and generate the
+    corresponding position data and input mask.
+    """
+    return_list = []
+    max_len = max(len(inst) for inst in insts)
+    # Any token included in dict can be used to pad, since the paddings' loss
+    # will be masked out by weights and make no effect on parameter gradients.
+    inst_data = np.array([
+        list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
+    ])
+    return_list += [inst_data.astype("int64").reshape([-1, max_len])]
+    # position data
+    if return_pos:
+        inst_pos = np.array([
+            list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
+            for inst in insts
+        ])
+        return_list += [inst_pos.astype("int64").reshape([-1, max_len])]
+    if return_input_mask:
+        # This is used to avoid attention on paddings.
+        input_mask_data = np.array([[1] * len(inst) + [0] *
+                                    (max_len - len(inst)) for inst in insts])
+        input_mask_data = np.expand_dims(input_mask_data, axis=-1)
+        return_list += [input_mask_data.astype("float32")]
+    if return_max_len:
+        return_list += [max_len]
+    if return_num_token:
+        num_token = 0
+        for inst in insts:
+            num_token += len(inst)
+        return_list += [num_token]
+    return return_list if len(return_list) > 1 else return_list[0]
+if __name__ == "__main__":
+    pass
--- a/hapi/text/bert/bert.py
+++ b/hapi/text/bert/bert.py
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"bert"
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import six
+import json
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, to_variable, Layer, guard
+from hapi.text.text import PrePostProcessLayer, TransformerEncoder
+from hapi.text.bert.utils.init import init_from_static_model
+class BertConfig(object):
+    def __init__(self, config_path):
+        self._config_dict = self._parse(config_path)
+    def _parse(self, config_path):
+        try:
+            with open(config_path) as json_file:
+                config_dict = json.load(json_file)
+        except Exception:
+            raise IOError("Error in parsing bert model config file '%s'" %
+                          config_path)
+        else:
+            return config_dict
+    def __getitem__(self, key):
+        return self._config_dict[key]
+    def print_config(self):
+        for arg, value in sorted(six.iteritems(self._config_dict)):
+            print('%s: %s' % (arg, value))
+        print('------------------------------------------------')
+class BertEncoder(Layer):
+    """
+    bert
+    """
+    def __init__(self, config, return_pooled_out=True, use_fp16=False):
+        super(BertEncoder, self).__init__()
+        self.config = config
+        self._emb_size = config['hidden_size']
+        self._n_layer = config['num_hidden_layers']
+        self._n_head = config['num_attention_heads']
+        self._voc_size = config['vocab_size']
+        self._max_position_seq_len = config['max_position_embeddings']
+        self._sent_types = config['type_vocab_size']
+        self._hidden_act = config['hidden_act']
+        self._prepostprocess_dropout = config['hidden_dropout_prob']
+        self._attention_dropout = config['attention_probs_dropout_prob']
+        self.return_pooled_out = return_pooled_out
+        self._word_emb_name = "word_embedding"
+        self._pos_emb_name = "pos_embedding"
+        self._sent_emb_name = "sent_embedding"
+        self._dtype = "float16" if use_fp16 else "float32"
+        self._param_initializer = fluid.initializer.TruncatedNormal(
+            scale=config['initializer_range'])
+        self._src_emb = Embedding(
+            size=[self._voc_size, self._emb_size],
+            param_attr=fluid.ParamAttr(
+                name=self._word_emb_name, initializer=self._param_initializer),
+            dtype=self._dtype)
+        self._pos_emb = Embedding(
+            size=[self._max_position_seq_len, self._emb_size],
+            param_attr=fluid.ParamAttr(
+                name=self._pos_emb_name, initializer=self._param_initializer),
+            dtype=self._dtype)
+        self._sent_emb = Embedding(
+            size=[self._sent_types, self._emb_size],
+            param_attr=fluid.ParamAttr(
+                name=self._sent_emb_name, initializer=self._param_initializer),
+            dtype=self._dtype)
+        self.pooled_fc = Linear(
+            input_dim=self._emb_size,
+            output_dim=self._emb_size,
+            param_attr=fluid.ParamAttr(
+                name="pooled_fc.w_0", initializer=self._param_initializer),
+            bias_attr="pooled_fc.b_0",
+            act="tanh")
+        self.pre_process_layer = PrePostProcessLayer(
+            "nd", self._emb_size, self._prepostprocess_dropout, None)
+        self._encoder = TransformerEncoder(
+            n_layer=self._n_layer,
+            n_head=self._n_head,
+            d_key=self._emb_size // self._n_head,
+            d_value=self._emb_size // self._n_head,
+            d_model=self._emb_size,
+            d_inner_hid=self._emb_size * 4,
+            prepostprocess_dropout=self._prepostprocess_dropout,
+            attention_dropout=self._attention_dropout,
+            relu_dropout=0,
+            preprocess_cmd="",
+            postprocess_cmd="dan",
+            ffn_fc1_act=self._hidden_act)
+    def init_parameters(self, param_path="", verbose=False):
+        init_from_static_model(param_path, self, self.config, verbose)
+    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
+        """
+        forward
+        """
+        src_emb = self._src_emb(src_ids)
+        pos_emb = self._pos_emb(position_ids)
+        sent_emb = self._sent_emb(sentence_ids)
+        emb_out = src_emb + pos_emb
+        emb_out = emb_out + sent_emb
+        emb_out = self.pre_process_layer(emb_out)
+        self_attn_mask = fluid.layers.matmul(
+            x=input_mask, y=input_mask, transpose_y=True)
+        self_attn_mask = fluid.layers.scale(
+            x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
+        n_head_self_attn_mask = fluid.layers.stack(
+            x=[self_attn_mask] * self._n_head, axis=1)
+        n_head_self_attn_mask.stop_gradient = True
+        enc_output = self._encoder(emb_out, n_head_self_attn_mask)
+        if not self.return_pooled_out:
+            return enc_output
+        next_sent_feat = fluid.layers.slice(
+            input=enc_output, axes=[1], starts=[0], ends=[1])
+        next_sent_feat = self.pooled_fc(next_sent_feat)
+        next_sent_feat = fluid.layers.reshape(
+            next_sent_feat, shape=[-1, self._emb_size])
+        return enc_output, next_sent_feat
--- a/hapi/text/bert/data_processor.py
+++ b/hapi/text/bert/data_processor.py
--- a/hapi/text/bert/dataloader.py
+++ b/hapi/text/bert/dataloader.py
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import io
+import os
+import six
+import csv
+import glob
+import tarfile
+import itertools
+from functools import partial
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.parallel import ParallelEnv
+from paddle.fluid.io import BatchSampler, DataLoader, Dataset
+from hapi.distributed import DistributedBatchSampler
+from hapi.text.bert.data_processor import DataProcessor, XnliProcessor, ColaProcessor, MrpcProcessor, MnliProcessor
+from hapi.text.bert.batching import prepare_batch_data
+import hapi.text.tokenizer.tokenization as tokenization
+__all__ = [
+    'BertInputExample', 'BertInputFeatures', 'SingleSentenceDataset',
+    'SentencePairDataset'
+]
+class BertInputExample(object):
+    def __init__(self, uid, text_a, text_b=None, label=None):
+        self.uid = uid
+        self.text_a = text_a
+        self.text_b = text_b
+        self.label = label
+class BertInputFeatures(object):
+    def __init__(self, input_ids, input_mask, segment_ids, label_id):
+        self.input_ids = input_ids
+        self.pos_ids = list(range(len(self.input_ids)))
+        self.input_mask = input_mask
+        self.segment_ids = segment_ids
+        self.label_id = label_id
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+    """Truncates a sequence pair in place to the maximum length."""
+    # This is a simple heuristic which will always truncate the longer sequence
+    # one token at a time. This makes more sense than truncating an equal percent
+    # of tokens from each, since if one sequence is very short then each token
+    # that's truncated likely contains more information than a longer sequence.
+    while True:
+        total_length = len(tokens_a) + len(tokens_b)
+        if total_length <= max_length:
+            break
+        if len(tokens_a) > len(tokens_b):
+            tokens_a.pop()
+        else:
+            tokens_b.pop()
+def convert_single_example_to_unicode(guid, single_example):
+    text_a = tokenization.convert_to_unicode(single_example[0])
+    text_b = tokenization.convert_to_unicode(single_example[1])
+    label = tokenization.convert_to_unicode(single_example[2])
+    return BertInputExample(uid=uid, text_a=text_a, text_b=text_b, label=label)
+def convert_single_example(ex_index, example, label_list, max_seq_length,
+                           tokenizer):
+    """Converts a single `BertInputExample` into a single `BertInputFeatures`."""
+    label_map = {}
+    for (i, label) in enumerate(label_list):
+        label_map[label] = i
+    tokens_a = tokenizer.tokenize(example.text_a)
+    tokens_b = None
+    if example.text_b:
+        tokens_b = tokenizer.tokenize(example.text_b)
+    if tokens_b:
+        # Modifies `tokens_a` and `tokens_b` in place so that the total
+        # length is less than the specified length.
+        # Account for [CLS], [SEP], [SEP] with "- 3"
+        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+    else:
+        # Account for [CLS] and [SEP] with "- 2"
+        if len(tokens_a) > max_seq_length - 2:
+            tokens_a = tokens_a[0:(max_seq_length - 2)]
+    tokens = []
+    segment_ids = []
+    tokens.append("[CLS]")
+    segment_ids.append(0)
+    for token in tokens_a:
+        tokens.append(token)
+        segment_ids.append(0)
+    tokens.append("[SEP]")
+    segment_ids.append(0)
+    if tokens_b:
+        for token in tokens_b:
+            tokens.append(token)
+            segment_ids.append(1)
+        tokens.append("[SEP]")
+        segment_ids.append(1)
+    input_ids = tokenizer.convert_tokens_to_ids(tokens)
+    input_mask = [1] * len(input_ids)
+    label_id = label_map[example.label]
+    feature = BertInputFeatures(
+        input_ids=input_ids,
+        input_mask=input_mask,
+        segment_ids=segment_ids,
+        label_id=label_id)
+    return feature
+def convert_examples_to_features(examples, label_list, max_seq_length,
+                                 tokenizer):
+    """Convert a set of `InputExample`s to a list of `InputFeatures`."""
+    features = []
+    for (ex_index, example) in enumerate(examples):
+        if ex_index % 10000 == 0:
+            print("Writing example %d of %d" % (ex_index, len(examples)))
+        feature = convert_single_example(ex_index, example, label_list,
+                                         max_seq_length, tokenizer)
+        features.append(feature)
+    return features
+def _read_tsv(input_file, delimiter="\t", quotechar=None):
+    """Reads a tab separated value file."""
+    with io.open(input_file, "r", encoding="utf8") as f:
+        reader = csv.reader(f, delimiter=delimiter, quotechar=quotechar)
+        lines = []
+        for line in reader:
+            lines.append(line)
+        return lines
+class SingleSentenceDataset(Dataset):
+    def __init__(self,
+                 tokenizer,
+                 label_list,
+                 max_seq_length,
+                 mode="all_in_memory"):
+        assert isinstance(mode,
+                          str), "mode of SingleSentenceDataset should be str"
+        assert mode in [
+            "all_in_memory", "leveldb"
+        ], "mode of SingleSentenceDataset should be in [all_in_memory, leveldb], but get" % mode
+        self.examples = []
+    def load_all_data_in_memory(self,
+                                input_file,
+                                label_list,
+                                max_seq_length,
+                                tokenizer,
+                                line_processor=None,
+                                delimiter="\t",
+                                quotechar=None):
+        lines = _read_tsv(input_file, delimiter=delimiter, quotechar=quotechar)
+        def default_line_processor(line_id, line):
+            assert len(line) == 2
+            text_a = line[0]
+            label = line[1]
+            return BertInputExample(
+                str(line_id), text_a=text_a, text_b=None, label=label)
+        if line_processor is None:
+            line_processor = default_line_processor
+        for (line_id, line) in enumerate(lines):
+            input_example = line_processor(line_id, line)
+            if not input_example:
+                continue
+            input_feature = convert_single_example(
+                str(line_id), input_example, label_list, max_seq_length,
+                tokenizer)
+            self.examples.append(input_feature)
+    def __getitem__(self, idx):
+        return self.examples[idx].input_ids, self.examples[
+            idx].pos_ids, self.examples[idx].segment_ids, self.examples[
+                idx].label_id
+    def __len__(self):
+        return len(self.examples)
+class SentencePairDataset(Dataset):
+    def __init__(self,
+                 tokenizer,
+                 label_ist,
+                 max_seq_length,
+                 mode="all_in_memory"):
+        assert isinstance(mode,
+                          str), "mode of SentencePairDataset should be str"
+        assert mode in [
+            "all_in_memory", "leveldb"
+        ], "mode of SentencePairDataset should be in [all_in_memory, leveldb], but get" % mode
+        self.examples = []
+    def load_all_data_in_memory(self,
+                                input_file,
+                                label_list,
+                                max_seq_length,
+                                tokenizer,
+                                line_processor=None,
+                                delimiter="\t",
+                                quotechar=None):
+        lines = _read_tsv(input_file, delimiter=delimiter, quotechar=quotechar)
+        def default_line_processor(line_id, line):
+            assert len(line) == 3
+            text_a = line[0]
+            text_b = line[1]
+            label = line[2]
+            return BertInputExample(
+                str(line_id), text_a=text_a, text_b=text_b, label=label)
+        if line_processor is None:
+            line_processor = default_line_processor
+        for (line_id, line) in enumerate(lines):
+            input_example = line_processor(line_id, line)
+            if not input_example:
+                continue
+            input_feature = convert_single_example(
+                str(line_id), input_example, label_list, max_seq_length,
+                tokenizer)
+            self.examples.append(input_feature)
+    def __getitem__(self, idx):
+        return self.examples[idx].input_ids, self.examples[
+            idx].pos_ids, self.examples[idx].segment_ids, self.examples[
+                idx].label_id
+    def __len__(self):
+        return len(self.examples)
+def _prepare_train_batch(insts,
+                         vocab_size=0,
+                         pad_id=None,
+                         cls_id=None,
+                         sep_id=None,
+                         mask_id=-1,
+                         return_input_mask=True,
+                         return_max_len=True,
+                         return_num_token=False):
+    return prepare_batch_data(
+        insts,
+        0,
+        voc_size=vocab_size,
+        pad_id=pad_id,
+        cls_id=cls_id,
+        sep_id=sep_id,
+        mask_id=mask_id,
+        return_input_mask=return_input_mask,
+        return_max_len=return_max_len,
+        return_num_token=return_num_token)
+class SingleSentenceDataLoader(object):
+    def __init__(self,
+                 input_file,
+                 tokenizer,
+                 label_list,
+                 max_seq_length,
+                 batch_size,
+                 shuffle=False,
+                 drop_last=False,
+                 mode="all_in_memory",
+                 line_processor=None,
+                 delimiter="\t",
+                 quotechar=None,
+                 device=fluid.CPUPlace(),
+                 num_workers=0,
+                 return_list=True):
+        self.dataset = SingleSentenceDataset(tokenizer, label_list,
+                                             max_seq_length, mode)
+        if mode == "all_in_memory":
+            self.dataset.load_all_data_in_memory(
+                input_file, label_list, max_seq_length, tokenizer,
+                line_processor, delimiter, quotechar)
+        elif mode == "leveldb":
+            #TODO add leveldb reader
+            pass
+        else:
+            raise ValueError("mode should be in [all_in_memory, leveldb]")
+        self.sampler = DistributedBatchSampler(
+            self.dataset, batch_size, shuffle=shuffle, drop_last=drop_last)
+        self.dataloader = DataLoader(
+            dataset=self.dataset,
+            batch_sampler=self.sampler,
+            places=device,
+            collate_fn=partial(
+                _prepare_train_batch,
+                vocab_size=-1,
+                pad_id=tokenizer.vocab["[PAD]"],
+                cls_id=tokenizer.vocab["[CLS]"],
+                sep_id=tokenizer.vocab["[SEP]"],
+                mask_id=-1,
+                return_input_mask=True,
+                return_max_len=False,
+                return_num_token=False),
+            num_workers=num_workers,
+            return_list=return_list)
+if __name__ == "__main__":
+    print("hello world.")
--- a/hapi/text/bert/optimization.py
+++ b/hapi/text/bert/optimization.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Optimization and learning rate scheduling."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
+class ConstantLR(LearningRateDecay):
+    def __init__(self, learning_rate, begin=0, step=1, dtype='float32'):
+        super(ConstantLR, self).__init__(begin, step, dtype)
+        self.learning_rate = learning_rate
+    def step(self):
+        return self.learning_rate
+class LinearDecay(LearningRateDecay):
+    def __init__(self,
+                 learning_rate,
+                 warmup_steps,
+                 decay_steps,
+                 end_learning_rate=0.0001,
+                 power=1.0,
+                 cycle=False,
+                 begin=0,
+                 step=1,
+                 dtype='float32'):
+        super(LinearDecay, self).__init__(begin, step, dtype)
+        self.learning_rate = learning_rate
+        self.warmup_steps = warmup_steps
+        self.decay_steps = decay_steps
+        self.end_learning_rate = end_learning_rate
+        self.power = power
+        self.cycle = cycle
+    def step(self):
+        if self.step_num < self.warmup_steps:
+            decayed_lr = self.learning_rate * (self.step_num /
+                                               self.warmup_steps)
+            decayed_lr = self.create_lr_var(decayed_lr)
+        else:
+            tmp_step_num = self.step_num
+            tmp_decay_steps = self.decay_steps
+            if self.cycle:
+                div_res = fluid.layers.ceil(
+                    self.create_lr_var(tmp_step_num / float(self.decay_steps)))
+                if tmp_step_num == 0:
+                    div_res = self.create_lr_var(1.0)
+                tmp_decay_steps = self.decay_steps * div_res
+            else:
+                tmp_step_num = self.create_lr_var(
+                    tmp_step_num
+                    if tmp_step_num < self.decay_steps else self.decay_steps)
+                decayed_lr = (self.learning_rate - self.end_learning_rate) * \
+                    ((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate
+        return decayed_lr
+class Optimizer(object):
+    def __init__(self,
+                 warmup_steps,
+                 num_train_steps,
+                 learning_rate,
+                 model_cls,
+                 weight_decay,
+                 scheduler='linear_warmup_decay',
+                 loss_scaling=1.0,
+                 parameter_list=None):
+        self.warmup_steps = warmup_steps
+        self.num_train_steps = num_train_steps
+        self.learning_rate = learning_rate
+        self.model_cls = model_cls
+        self.weight_decay = weight_decay
+        self.scheduler = scheduler
+        self.loss_scaling = loss_scaling
+        self.parameter_list = parameter_list
+        self.scheduled_lr = 0.0
+        self.optimizer = self.lr_schedule()
+    def lr_schedule(self):
+        if self.warmup_steps > 0:
+            if self.scheduler == 'noam_decay':
+                self.scheduled_lr = fluid.dygraph.NoamDecay(1 / (
+                    self.warmup_steps * (self.learning_rate**2)),
+                                                            self.warmup_steps)
+            elif self.scheduler == 'linear_warmup_decay':
+                self.scheduled_lr = LinearDecay(self.learning_rate,
+                                                self.warmup_steps,
+                                                self.num_train_steps, 0.0)
+            else:
+                raise ValueError("Unkown learning rate scheduler, should be "
+                                 "'noam_decay' or 'linear_warmup_decay'")
+            optimizer = fluid.optimizer.Adam(
+                learning_rate=self.scheduled_lr,
+                parameter_list=self.parameter_list)
+        else:
+            self.scheduled_lr = ConstantLR(self.learning_rate)
+            optimizer = fluid.optimizer.Adam(
+                learning_rate=self.scheduled_lr,
+                parameter_list=self.parameter_list)
+        return optimizer
+    def exclude_from_weight_decay(self, name):
+        if name.find("layer_norm") > -1:
+            return True
+        bias_suffix = ["_bias", "_b", ".b_0"]
+        for suffix in bias_suffix:
+            if name.endswith(suffix):
+                return True
+        return False
+    def minimize(self, loss, use_data_parallel=False, model=None):
+        param_list = dict()
+        clip_norm_thres = 1.0
+        #grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm_thres)
+        if use_data_parallel:
+            loss = model.scale_loss(loss)
+        loss.backward()
+        if self.weight_decay > 0:
+            for param in self.model_cls.parameters():
+                param_list[param.name] = param * 1.0
+                param_list[param.name].stop_gradient = True
+        if use_data_parallel:
+            assert model is not None
+            model.apply_collective_grads()
+        #_, param_grads = self.optimizer.minimize(loss, grad_clip=grad_clip)
+        _, param_grads = self.optimizer.minimize(loss)
+        if self.weight_decay > 0:
+            for param, grad in param_grads:
+                if self.exclude_from_weight_decay(param.name):
+                    continue
+                if isinstance(self.scheduled_lr.step(), float):
+                    updated_param = param.numpy() - param_list[
+                        param.name].numpy(
+                        ) * self.weight_decay * self.scheduled_lr.step()
+                else:
+                    updated_param = param.numpy(
+                    ) - param_list[param.name].numpy(
+                    ) * self.weight_decay * self.scheduled_lr.step().numpy()
+                updated_param_var = fluid.dygraph.to_variable(updated_param)
+                param = updated_param_var
+                #param = fluid.layers.reshape(x=updated_param_var, shape=list(updated_param_var.shape))
--- a/hapi/text/bert/static_optimization.py
+++ b/hapi/text/bert/static_optimization.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Optimization and learning rate scheduling."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+import paddle.fluid as fluid
+from utils.fp16 import create_master_params_grads, master_param_to_train_param, apply_dynamic_loss_scaling
+def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
+    """ Applies linear warmup of learning rate from 0 and decay to 0."""
+    with fluid.default_main_program()._lr_schedule_guard():
+        lr = fluid.layers.tensor.create_global_var(
+            shape=[1],
+            value=0.0,
+            dtype='float32',
+            persistable=True,
+            name="scheduled_learning_rate")
+        global_step = fluid.layers.learning_rate_scheduler._decay_step_counter(
+        )
+        with fluid.layers.control_flow.Switch() as switch:
+            with switch.case(global_step < warmup_steps):
+                warmup_lr = learning_rate * (global_step / warmup_steps)
+                fluid.layers.tensor.assign(warmup_lr, lr)
+            with switch.default():
+                decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
+                    learning_rate=learning_rate,
+                    decay_steps=num_train_steps,
+                    end_learning_rate=0.0,
+                    power=1.0,
+                    cycle=False)
+                fluid.layers.tensor.assign(decayed_lr, lr)
+        return lr
+def optimization(loss,
+                 warmup_steps,
+                 num_train_steps,
+                 learning_rate,
+                 train_program,
+                 startup_prog,
+                 weight_decay,
+                 scheduler='linear_warmup_decay',
+                 use_fp16=False,
+                 use_dynamic_loss_scaling=False,
+                 init_loss_scaling=1.0,
+                 incr_every_n_steps=1000,
+                 decr_every_n_nan_or_inf=2,
+                 incr_ratio=2.0,
+                 decr_ratio=0.8):
+    scheduled_lr, loss_scaling = None, None
+    if scheduler == 'noam_decay':
+        if warmup_steps > 0:
+            scheduled_lr = fluid.layers.learning_rate_scheduler\
+             .noam_decay(1/(warmup_steps *(learning_rate ** 2)),
+           warmup_steps)
+        else:
+            print(
+                "WARNING: noam decay of learning rate should have postive warmup "
+                "steps but given {}, using constant learning rate instead!"
+                .format(warmup_steps))
+            scheduled_lr = fluid.layers.create_global_var(
+                name=fluid.unique_name.generate("learning_rate"),
+                shape=[1],
+                value=learning_rate,
+                dtype='float32',
+                persistable=True)
+    elif scheduler == 'linear_warmup_decay':
+        if warmup_steps > 0:
+            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
+                                               num_train_steps)
+        else:
+            print(
+                "WARNING: linear warmup decay of learning rate should have "
+                "postive warmup steps but given {}, use constant learning rate "
+                "instead!".format(warmup_steps))
+            scheduled_lr = fluid.layers.create_global_var(
+                name=fluid.unique_name.generate("learning_rate"),
+                shape=[1],
+                value=learning_rate,
+                dtype='float32',
+                persistable=True)
+    else:
+        raise ValueError("Unkown learning rate scheduler, should be "
+                         "'noam_decay' or 'linear_warmup_decay'")
+    optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
+    fluid.clip.set_gradient_clip(
+        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
+    def exclude_from_weight_decay(param):
+        name = param.name.rstrip(".master")
+        if name.find("layer_norm") > -1:
+            return True
+        bias_suffix = ["_bias", "_b", ".b_0"]
+        for suffix in bias_suffix:
+            if name.endswith(suffix):
+                return True
+        return False
+    param_list = dict()
+    if use_fp16:
+        loss_scaling = fluid.layers.create_global_var(
+            name=fluid.unique_name.generate("loss_scaling"),
+            shape=[1],
+            value=init_loss_scaling,
+            dtype='float32',
+            persistable=True)
+        loss *= loss_scaling
+        param_grads = optimizer.backward(loss)
+        master_param_grads = create_master_params_grads(
+            param_grads, train_program, startup_prog, loss_scaling)
+        if weight_decay > 0:
+            for param, _ in master_param_grads:
+                param_list[param.name] = param * 1.0
+                param_list[param.name].stop_gradient = True
+        if use_dynamic_loss_scaling:
+            apply_dynamic_loss_scaling(
+                loss_scaling, master_param_grads, incr_every_n_steps,
+                decr_every_n_nan_or_inf, incr_ratio, decr_ratio)
+        optimizer.apply_gradients(master_param_grads)
+        if weight_decay > 0:
+            for param, grad in master_param_grads:
+                if exclude_from_weight_decay(param):
+                    continue
+                with param.block.program._optimized_guard(
+                    [param, grad]), fluid.framework.name_scope("weight_decay"):
+                    updated_param = param - param_list[
+                        param.name] * weight_decay * scheduled_lr
+                    fluid.layers.assign(output=param, input=updated_param)
+        master_param_to_train_param(master_param_grads, param_grads,
+                                    train_program)
+    else:
+        if weight_decay > 0:
+            for param in train_program.all_parameters():
+                param_list[param.name] = param * 1.0
+                param_list[param.name].stop_gradient = True
+        _, param_grads = optimizer.minimize(loss)
+        if weight_decay > 0:
+            for param, grad in param_grads:
+                if exclude_from_weight_decay(param):
+                    continue
+                with param.block.program._optimized_guard(
+                    [param, grad]), fluid.framework.name_scope("weight_decay"):
+                    updated_param = param - param_list[
+                        param.name] * weight_decay * scheduled_lr
+                    fluid.layers.assign(output=param, input=updated_param)
+    return scheduled_lr, loss_scaling
--- a/hapi/text/bert/utils/__init__.py
+++ b/hapi/text/bert/utils/__init__.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from hapi.text.bert.utils.args import str2bool as str2bool
+from hapi.text.bert.utils.args import ArgumentGroup as ArgumentGroup
+from hapi.text.bert.utils.args import print_arguments as print_arguments
+from hapi.text.bert.utils.args import check_cuda as check_cuda
+from hapi.text.bert.utils.cards import get_cards as get_cards
+from hapi.text.bert.utils.fp16 import cast_fp16_to_fp32 as cast_fp16_to_fp32
+from hapi.text.bert.utils.fp16 import cast_fp32_to_fp16 as cast_fp32_to_fp16
+from hapi.text.bert.utils.fp16 import copy_to_master_param as copy_to_master_param
+from hapi.text.bert.utils.fp16 import create_master_params_grads as create_master_params_grads
+from hapi.text.bert.utils.fp16 import master_param_to_train_param as master_param_to_train_param
+from hapi.text.bert.utils.init import init_checkpoint as init_checkpoint
+from hapi.text.bert.utils.init import init_pretraining_params as init_pretraining_params
+from hapi.text.bert.utils.init import init_from_static_model as init_from_static_model
--- a/hapi/text/bert/utils/args.py
+++ b/hapi/text/bert/utils/args.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Arguments for configuration."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import six
+import argparse
+import paddle.fluid as fluid
+def str2bool(v):
+    # because argparse does not support to parse "true, False" as python
+    # boolean directly
+    return v.lower() in ("true", "t", "1")
+class ArgumentGroup(object):
+    def __init__(self, parser, title, des):
+        self._group = parser.add_argument_group(title=title, description=des)
+    def add_arg(self, name, type, default, help, **kwargs):
+        type = str2bool if type == bool else type
+        self._group.add_argument(
+            "--" + name,
+            default=default,
+            type=type,
+            help=help + ' Default: %(default)s.',
+            **kwargs)
+def print_arguments(args):
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(six.iteritems(vars(args))):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+def check_cuda(use_cuda, err = \
+    "\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \
+    Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n"
+                                                                                                                     ):
+    try:
+        if use_cuda == True and fluid.is_compiled_with_cuda() == False:
+            print(err)
+            sys.exit(1)
+    except Exception as e:
+        pass
--- a/hapi/text/bert/utils/cards.py
+++ b/hapi/text/bert/utils/cards.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+def get_cards():
+    """
+    get gpu cards number
+    """
+    num = 0
+    cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
+    if cards != '':
+        num = len(cards.split(","))
+    return num
--- a/hapi/text/bert/utils/convert_static_to_dygraph.py
+++ b/hapi/text/bert/utils/convert_static_to_dygraph.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import shutil
+import sys
+import os
+def usage():
+    """
+    usage information
+    """
+    print
+    print("please use command: ")
+    print(
+        "python convert_static_to_dygraph.py input_params_dir output_params_dir"
+    )
+    print
+def convert_static_to_dygraph(static_model_path, dygraph_model_path):
+    """
+    convert paddle static bert model to dygraph model 
+    """
+    def mkdir(path):
+        if not os.path.isdir(path):
+            if os.path.split(path)[0]:
+                mkdir(os.path.split(path)[0])
+        else:
+            return
+        os.mkdir(path)
+    if os.path.exists(dygraph_model_path):
+        shutil.rmtree(dygraph_model_path)
+    mkdir(dygraph_model_path)
+    if not os.path.exists(static_model_path):
+        print("paddle static model path doesn't exist.....")
+        return -1
+    file_list = []
+    for root, dirs, files in os.walk(static_model_path):
+        file_list.extend(files)
+    os.makedirs(os.path.join(dygraph_model_path, "PretrainModelLayer_0"))
+    os.makedirs(
+        os.path.join(dygraph_model_path,
+                     "PretrainModelLayer_0/BertModelLayer_0"))
+    os.makedirs(
+        os.path.join(dygraph_model_path,
+                     "PretrainModelLayer_0/PrePostProcessLayer_0"))
+    os.makedirs(
+        os.path.join(
+            dygraph_model_path,
+            "PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0"))
+    #os.chdir(static_model_path)
+    #convert embedding file
+    embedding_type = ["word", "pos", "sent"]
+    for i in range(3):
+        src_name = embedding_type[i] + "_embedding"
+        trg_name = "Embedding_" + str(i) + "." + src_name
+        shutil.copyfile(
+            os.path.join(static_model_path, src_name),
+            os.path.join(dygraph_model_path,
+                         "PretrainModelLayer_0/BertModelLayer_0/" + trg_name))
+    #convert pre_encoder file
+    shutil.copyfile(
+        os.path.join(static_model_path, "pre_encoder_layer_norm_scale"),
+        os.path.join(
+            dygraph_model_path,
+            "PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
+        ))
+    shutil.copyfile(
+        os.path.join(static_model_path, "pre_encoder_layer_norm_bias"),
+        os.path.join(
+            dygraph_model_path,
+            "PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
+        ))
+    #convert mask lm params file
+    shutil.copyfile(
+        os.path.join(static_model_path, "mask_lm_out_fc.b_0"),
+        os.path.join(dygraph_model_path,
+                     "PretrainModelLayer_0/Layer_0.mask_lm_out_fc.b_0"))
+    shutil.copyfile(
+        os.path.join(static_model_path, "mask_lm_trans_fc.b_0"),
+        os.path.join(dygraph_model_path,
+                     "PretrainModelLayer_0/FC_0.mask_lm_trans_fc.b_0"))
+    shutil.copyfile(
+        os.path.join(static_model_path, "mask_lm_trans_fc.w_0"),
+        os.path.join(dygraph_model_path,
+                     "PretrainModelLayer_0/FC_0.mask_lm_trans_fc.w_0"))
+    shutil.copyfile(
+        os.path.join(static_model_path, "mask_lm_trans_layer_norm_bias"),
+        os.path.join(
+            dygraph_model_path,
+            "PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
+        ))
+    shutil.copyfile(
+        os.path.join(static_model_path, "mask_lm_trans_layer_norm_scale"),
+        os.path.join(
+            dygraph_model_path,
+            "PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
+        ))
+    shutil.copyfile(
+        os.path.join(static_model_path, "next_sent_fc.b_0"),
+        os.path.join(dygraph_model_path,
+                     "PretrainModelLayer_0/FC_1.next_sent_fc.b_0"))
+    shutil.copyfile(
+        os.path.join(static_model_path, "next_sent_fc.w_0"),
+        os.path.join(dygraph_model_path,
+                     "PretrainModelLayer_0/FC_1.next_sent_fc.w_0"))
+    shutil.copyfile(
+        os.path.join(static_model_path, "pooled_fc.b_0"),
+        os.path.join(
+            dygraph_model_path,
+            "PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.b_0"))
+    shutil.copyfile(
+        os.path.join(static_model_path, "pooled_fc.w_0"),
+        os.path.join(
+            dygraph_model_path,
+            "PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.w_0"))
+    encoder_num = 0
+    for f in file_list:
+        if not f.startswith("encoder_layer"):
+            continue
+        layer_num = f.split('_')[2]
+        if int(layer_num) > encoder_num:
+            encoder_num = int(layer_num)
+    encoder_num += 1
+    for i in range(encoder_num):
+        encoder_dir = "EncoderSubLayer_" + str(i)
+        os.makedirs(
+            os.path.join(dygraph_model_path,
+                         "PretrainModelLayer_0/BertModelLayer_0/" +
+                         "EncoderLayer_0/", encoder_dir))
+        os.makedirs(
+            os.path.join(dygraph_model_path,
+                         "PretrainModelLayer_0/BertModelLayer_0/" +
+                         "EncoderLayer_0/", encoder_dir +
+                         "/PositionwiseFeedForwardLayer_0"))
+        os.makedirs(
+            os.path.join(
+                dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
+                "EncoderLayer_0/", encoder_dir + "/MultiHeadAttentionLayer_0"))
+        os.makedirs(
+            os.path.join(
+                dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
+                "EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_1"))
+        os.makedirs(
+            os.path.join(
+                dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
+                "EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_3"))
+    encoder_map_dict = {
+        "ffn_fc_0.b_0":
+        ("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.b_0"),
+        "ffn_fc_0.w_0":
+        ("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.w_0"),
+        "ffn_fc_1.b_0":
+        ("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.b_0"),
+        "ffn_fc_1.w_0":
+        ("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.w_0"),
+        "multi_head_att_key_fc.b_0":
+        ("MultiHeadAttentionLayer_0", "FC_1.key_fc.b_0"),
+        "multi_head_att_key_fc.w_0":
+        ("MultiHeadAttentionLayer_0", "FC_1.key_fc.w_0"),
+        "multi_head_att_output_fc.b_0":
+        ("MultiHeadAttentionLayer_0", "FC_3.output_fc.b_0"),
+        "multi_head_att_output_fc.w_0":
+        ("MultiHeadAttentionLayer_0", "FC_3.output_fc.w_0"),
+        "multi_head_att_query_fc.b_0":
+        ("MultiHeadAttentionLayer_0", "FC_0.query_fc.b_0"),
+        "multi_head_att_query_fc.w_0":
+        ("MultiHeadAttentionLayer_0", "FC_0.query_fc.w_0"),
+        "multi_head_att_value_fc.b_0":
+        ("MultiHeadAttentionLayer_0", "FC_2.value_fc.b_0"),
+        "multi_head_att_value_fc.w_0":
+        ("MultiHeadAttentionLayer_0", "FC_2.value_fc.w_0"),
+        "post_att_layer_norm_bias":
+        ("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_bias"),
+        "post_att_layer_norm_scale":
+        ("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_scale"),
+        "post_ffn_layer_norm_bias":
+        ("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_bias"),
+        "post_ffn_layer_norm_scale":
+        ("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_scale")
+    }
+    for f in file_list:
+        if not f.startswith("encoder_layer"):
+            continue
+        layer_num = f.split('_')[2]
+        suffix_name = "_".join(f.split('_')[3:])
+        in_dir = encoder_map_dict[suffix_name][0]
+        rename = encoder_map_dict[suffix_name][1]
+        encoder_layer = "EncoderSubLayer_" + layer_num
+        shutil.copyfile(
+            os.path.join(static_model_path, f),
+            os.path.join(
+                dygraph_model_path,
+                "PretrainModelLayer_0/BertModelLayer_0/EncoderLayer_0/" +
+                encoder_layer + "/" + in_dir + "/" + rename))
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        usage()
+        exit(1)
+    static_model_path = sys.argv[1]
+    dygraph_model_path = sys.argv[2]
+    convert_static_to_dygraph(static_model_path, dygraph_model_path)
--- a/hapi/text/bert/utils/fp16.py
+++ b/hapi/text/bert/utils/fp16.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import paddle
+import paddle.fluid as fluid
+def cast_fp16_to_fp32(i, o, prog):
+    prog.global_block().append_op(
+        type="cast",
+        inputs={"X": i},
+        outputs={"Out": o},
+        attrs={
+            "in_dtype": fluid.core.VarDesc.VarType.FP16,
+            "out_dtype": fluid.core.VarDesc.VarType.FP32
+        })
+def cast_fp32_to_fp16(i, o, prog):
+    prog.global_block().append_op(
+        type="cast",
+        inputs={"X": i},
+        outputs={"Out": o},
+        attrs={
+            "in_dtype": fluid.core.VarDesc.VarType.FP32,
+            "out_dtype": fluid.core.VarDesc.VarType.FP16
+        })
+def copy_to_master_param(p, block):
+    v = block.vars.get(p.name, None)
+    if v is None:
+        raise ValueError("no param name %s found!" % p.name)
+    new_p = fluid.framework.Parameter(
+        block=block,
+        shape=v.shape,
+        dtype=fluid.core.VarDesc.VarType.FP32,
+        type=v.type,
+        lod_level=v.lod_level,
+        stop_gradient=p.stop_gradient,
+        trainable=p.trainable,
+        optimize_attr=p.optimize_attr,
+        regularizer=p.regularizer,
+        gradient_clip_attr=p.gradient_clip_attr,
+        error_clip=p.error_clip,
+        name=v.name + ".master")
+    return new_p
+def create_master_params_grads(params_grads, main_prog, startup_prog,
+                               loss_scaling):
+    master_params_grads = []
+    tmp_role = main_prog._current_role
+    OpRole = fluid.core.op_proto_and_checker_maker.OpRole
+    main_prog._current_role = OpRole.Backward
+    for p, g in params_grads:
+        # create master parameters
+        master_param = copy_to_master_param(p, main_prog.global_block())
+        startup_master_param = startup_prog.global_block()._clone_variable(
+            master_param)
+        startup_p = startup_prog.global_block().var(p.name)
+        cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
+        # cast fp16 gradients to fp32 before apply gradients
+        if g.name.find("layer_norm") > -1:
+            if loss_scaling > 1:
+                scaled_g = g / float(loss_scaling)
+            else:
+                scaled_g = g
+            master_params_grads.append([p, scaled_g])
+            continue
+        master_grad = fluid.layers.cast(g, "float32")
+        if loss_scaling > 1:
+            master_grad = master_grad / float(loss_scaling)
+        master_params_grads.append([master_param, master_grad])
+    main_prog._current_role = tmp_role
+    return master_params_grads
+def master_param_to_train_param(master_params_grads, params_grads, main_prog):
+    for idx, m_p_g in enumerate(master_params_grads):
+        train_p, _ = params_grads[idx]
+        if train_p.name.find("layer_norm") > -1:
+            continue
+        with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
+            cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)
--- a/hapi/text/bert/utils/init.py
+++ b/hapi/text/bert/utils/init.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import os
+import six
+import ast
+import copy
+import numpy as np
+import paddle.fluid as fluid
+def cast_fp32_to_fp16(exe, main_program):
+    print("Cast parameters to float16 data format.")
+    for param in main_program.global_block().all_parameters():
+        if not param.name.endswith(".master"):
+            param_t = fluid.global_scope().find_var(param.name).get_tensor()
+            data = np.array(param_t)
+            if param.name.find("layer_norm") == -1:
+                param_t.set(np.float16(data).view(np.uint16), exe.place)
+            master_param_var = fluid.global_scope().find_var(param.name +
+                                                             ".master")
+            if master_param_var is not None:
+                master_param_var.get_tensor().set(data, exe.place)
+def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
+    assert os.path.exists(
+        init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path
+    def existed_persitables(var):
+        if not fluid.io.is_persistable(var):
+            return False
+        return os.path.exists(os.path.join(init_checkpoint_path, var.name))
+    fluid.io.load_vars(
+        exe,
+        init_checkpoint_path,
+        main_program=main_program,
+        predicate=existed_persitables)
+    print("Load model from {}".format(init_checkpoint_path))
+    if use_fp16:
+        cast_fp32_to_fp16(exe, main_program)
+def init_pretraining_params(exe,
+                            pretraining_params_path,
+                            main_program,
+                            use_fp16=False):
+    assert os.path.exists(pretraining_params_path
+                          ), "[%s] cann't be found." % pretraining_params_path
+    def existed_params(var):
+        if not isinstance(var, fluid.framework.Parameter):
+            return False
+        return os.path.exists(os.path.join(pretraining_params_path, var.name))
+    fluid.io.load_vars(
+        exe,
+        pretraining_params_path,
+        main_program=main_program,
+        predicate=existed_params)
+    print("Load pretraining parameters from {}.".format(
+        pretraining_params_path))
+    if use_fp16:
+        cast_fp32_to_fp16(exe, main_program)
+def init_from_static_model(dir_path,
+                           backbone_model,
+                           bert_config,
+                           verbose=False):
+    def load_numpy_weight(file_name):
+        if six.PY2:
+            res = np.load(os.path.join(dir_path, file_name), allow_pickle=True)
+        else:
+            res = np.load(
+                os.path.join(dir_path, file_name),
+                allow_pickle=True,
+                encoding='latin1')
+        assert res is not None
+        return res
+    # load word embedding
+    _param = load_numpy_weight("word_embedding")
+    backbone_model._src_emb.set_dict({"weight": _param})
+    if verbose:
+        print("INIT word embedding")
+    _param = load_numpy_weight("pos_embedding")
+    backbone_model._pos_emb.set_dict({"weight": _param})
+    if verbose:
+        print("INIT pos embedding")
+    _param = load_numpy_weight("sent_embedding")
+    backbone_model._sent_emb.set_dict({"weight": _param})
+    if verbose:
+        print("INIT sent embedding")
+    _param0 = load_numpy_weight("pooled_fc.w_0")
+    _param1 = load_numpy_weight("pooled_fc.b_0")
+    backbone_model.pooled_fc.set_dict({"weight": _param0, "bias": _param1})
+    if verbose:
+        print("INIT pooled_fc")
+    _param0 = load_numpy_weight("pre_encoder_layer_norm_scale")
+    _param1 = load_numpy_weight("pre_encoder_layer_norm_bias")
+    backbone_model.pre_process_layer._sub_layers["layer_norm_0"].set_dict({
+        "weight": _param0,
+        "bias": _param1
+    })
+    if verbose:
+        print("INIT pre_encoder layer norm")
+    for _i in range(bert_config["num_hidden_layers"]):
+        _param_weight = "encoder_layer_%d_multi_head_att_query_fc.w_0" % _i
+        _param_bias = "encoder_layer_%d_multi_head_att_query_fc.b_0" % _i
+        _param_weight = load_numpy_weight(_param_weight)
+        _param_bias = load_numpy_weight(_param_bias)
+        backbone_model._encoder._sub_layers["layer_%d" %
+                                            _i].self_attn.q_fc.set_dict({
+                                                "weight": _param_weight,
+                                                "bias": _param_bias
+                                            })
+        if verbose:
+            print("INIT multi_head_att_query_fc %d" % _i)
+        _param_weight = "encoder_layer_%d_multi_head_att_key_fc.w_0" % _i
+        _param_bias = "encoder_layer_%d_multi_head_att_key_fc.b_0" % _i
+        _param_weight = load_numpy_weight(_param_weight)
+        _param_bias = load_numpy_weight(_param_bias)
+        backbone_model._encoder._sub_layers["layer_%d" %
+                                            _i].self_attn.k_fc.set_dict({
+                                                "weight": _param_weight,
+                                                "bias": _param_bias
+                                            })
+        if verbose:
+            print("INIT multi_head_att_key_fc %d" % _i)
+        _param_weight = "encoder_layer_%d_multi_head_att_value_fc.w_0" % _i
+        _param_bias = "encoder_layer_%d_multi_head_att_value_fc.b_0" % _i
+        _param_weight = load_numpy_weight(_param_weight)
+        _param_bias = load_numpy_weight(_param_bias)
+        backbone_model._encoder._sub_layers["layer_%d" %
+                                            _i].self_attn.v_fc.set_dict({
+                                                "weight": _param_weight,
+                                                "bias": _param_bias
+                                            })
+        if verbose:
+            print("INIT multi_head_att_value_fc %d" % _i)
+        # init output fc
+        _param_weight = "encoder_layer_%d_multi_head_att_output_fc.w_0" % _i
+        _param_bias = "encoder_layer_%d_multi_head_att_output_fc.b_0" % _i
+        _param_weight = load_numpy_weight(_param_weight)
+        _param_bias = load_numpy_weight(_param_bias)
+        backbone_model._encoder._sub_layers["layer_%d" %
+                                            _i].self_attn.proj_fc.set_dict({
+                                                "weight": _param_weight,
+                                                "bias": _param_bias
+                                            })
+        if verbose:
+            print("INIT multi_head_att_output_fc %d" % _i)
+        # init layer_norm 1
+        _param_weight = "encoder_layer_%d_post_att_layer_norm_scale" % _i
+        _param_bias = "encoder_layer_%d_post_att_layer_norm_bias" % _i
+        _param_weight = load_numpy_weight(_param_weight)
+        _param_bias = load_numpy_weight(_param_bias)
+        backbone_model._encoder._sub_layers[
+            "layer_%d" % _i].postprocesser1.layer_norm_0.set_dict({
+                "weight": _param_weight,
+                "bias": _param_bias
+            })
+        if verbose:
+            print("INIT layer norm in attention at %d layer" % _i)
+        # init layer_norm 2
+        _param_weight = "encoder_layer_%d_post_ffn_layer_norm_scale" % _i
+        _param_bias = "encoder_layer_%d_post_ffn_layer_norm_bias" % _i
+        _param_weight = load_numpy_weight(_param_weight)
+        _param_bias = load_numpy_weight(_param_bias)
+        backbone_model._encoder._sub_layers[
+            "layer_%d" % _i].postprocesser2.layer_norm_0.set_dict({
+                "weight": _param_weight,
+                "bias": _param_bias
+            })
+        if verbose:
+            print("INIT layer norm in FFN at %d layer" % _i)
+        # init FFN 1
+        _param_weight = "encoder_layer_%d_ffn_fc_0.w_0" % _i
+        _param_bias = "encoder_layer_%d_ffn_fc_0.b_0" % _i
+        _param_weight = load_numpy_weight(_param_weight)
+        _param_bias = load_numpy_weight(_param_bias)
+        backbone_model._encoder._sub_layers["layer_%d" % _i].ffn.fc1.set_dict({
+            "weight": _param_weight,
+            "bias": _param_bias
+        })
+        if verbose:
+            print("INIT FFN-1 at %d layer" % _i)
+        # init FFN 2
+        _param_weight = "encoder_layer_%d_ffn_fc_1.w_0" % _i
+        _param_bias = "encoder_layer_%d_ffn_fc_1.b_0" % _i
+        _param_weight = load_numpy_weight(_param_weight)
+        _param_bias = load_numpy_weight(_param_bias)
+        backbone_model._encoder._sub_layers["layer_%d" % _i].ffn.fc2.set_dict({
+            "weight": _param_weight,
+            "bias": _param_bias
+        })
+        if verbose:
+            print("INIT FFN-2 at %d layer" % _i)
+    return True
--- a/hapi/text/text.py
+++ b/hapi/text/text.py
--- a/hapi/text/tokenizer/__init__.py
+++ b/hapi/text/tokenizer/__init__.py
--- a/hapi/text/tokenizer/tokenization.py
+++ b/hapi/text/tokenizer/tokenization.py
--- a/setup.cfg
+++ b/setup.cfg
+[metadata]
+name = hapi
+author = zhouxiangyang
+author_email = zhouxiangyang@baidu.com
+version = 0.0.1
+description = HAPI
+long_description = file: README.md
+long_description_content_type = text/markdown
+home_page = https://github.com/PaddlePaddle/hapi
+license = Apache 2.0
+classifier =
+    Private :: Do Not Upload
+    Programming Language :: Python
+    Programming Language :: Python :: 2
+    Programming Language :: Python :: 2.7
+    Programming Language :: Python :: 3
+    Programming Language :: Python :: 3.5
+    Programming Language :: Python :: 3.6
+    Programming Language :: Python :: 3.7
+keywords =
+    paddlepaddle
+    paddle
+    high-level-api
+[options]
+packages = find:
+#install_requires =
+#    paddlepaddle-gpu >= 1.5.2
+include_package_data = True
+zip_safe = False
+[sdist]
+dist_dir = output/dist
+[bdist_wheel]
+dist_dir = output/dist
+[easy_install]
+index_url = http://pip.baidu.com/root/baidu/+simple/
--- a/setup.py
+++ b/setup.py
+# -*- coding: UTF-8 -*-
+################################################################################
+#
+#   Copyright (c) 2019  Baidu.com, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+"""
+Setup script.
+Authors: zhouxiangyang(zhouxiangyang@baidu.com)
+Date:    2020/2/4 00:00:01
+"""
+import setuptools
+with open("README.md", "r") as fh:
+    long_description = fh.read()
+setuptools.setup(
+    name="hapi",
+    version="0.0.1",
+    author="PaddlePaddle",
+    author_email="zhouxiangyang@baidu.com",
+    description="A Paddle High-level API that supports both static and dynamic execution modes (still under development)",
+    # long_description=long_description,
+    # long_description_content_type="text/markdown",
+    url="https://github.com/PaddlePaddle/hapi",
+    # packages=setuptools.find_packages(),
+    packages=[
+        'hapi', 'hapi.text', 'hapi.text.tokenizer', 'hapi.text.bert',
+        'hapi.text.bert.utils'
+    ],
+    package_dir={
+        'hapi': './hapi',
+        'hapi.text': './hapi/text',
+        'hapi.text.tokenizer': './hapi/text/tokenizer',
+        'hapi.text.bert': './hapi/text/bert',
+        'hapi.text.bert.utils': './hapi/text/bert/utils',
+    },
+    platforms="any",
+    license='Apache 2.0',
+    classifiers=[
+        'License :: OSI Approved :: Apache Software License',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 2',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+    ], )
--- a/tests/test_bert_dataloader.py
+++ b/tests/test_bert_dataloader.py
+import paddle
+from hapi.model import set_device
+from hapi.text.bert.dataloader import SingleSentenceDataLoader
+import hapi.text.tokenizer.tokenization as tokenization
+device = set_device("cpu")
+paddle.fluid.enable_dygraph(device)
+tokenizer = tokenization.FullTokenizer(
+    vocab_file="./tmp/hapi/data/pretrained_models/uncased_L-12_H-768_A-12/vocab.txt",
+    do_lower_case=True)
+bert_dataloader = SingleSentenceDataLoader(
+    "./tmp/hapi/aaa.txt",
+    tokenizer, ["1", "2"],
+    max_seq_length=32,
+    batch_size=1)
+for data in bert_dataloader.dataloader():
+    print(data)