Commit 38fd12ef, authored by guosheng

Merge branch 'master' of https://github.com/PaddlePaddle/hapi into add-hapi-seq2seq

@@ -2,3 +2,6 @@
 *.json
 output*
 *checkpoint*
+build
+dist
+hapi.egg-info

bert_config_path: "./config/bert_config.json"
init_checkpoint: None
init_pretraining_params: None
checkpoints: "./saved_model"
epoch: 3
learning_rate: 0.0001
lr_scheduler: "linear_warmup_decay"
weight_decay: 0.01
warmup_proportion: 0.1
save_steps: 100000
validation_steps: 100000
loss_scaling: 1.0
skip_steps: 100
data_dir: None
vocab_path: None
max_seq_len: 512
batch_size: 32
in_tokens: False
do_lower_case: True
random_seed: 5512
use_cuda: True
shuffle: True
do_train: True
do_test: True
use_data_parallel: False
verbose: False
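
The YAML above only provides defaults: the fine-tuning script below loads it through hapi's Config, and the shell commands further down override individual keys (batch_size, learning_rate, ...) from the command line. A minimal sketch of that flow, assuming Config merges CLI flags over the YAML values the way its usage in bert_classifier.py suggests:

    # Sketch: load bert.yaml and let command-line flags override its defaults.
    from hapi.configure import Config

    config = Config(yaml_file="./bert.yaml")
    config.build()   # parses CLI flags such as --batch_size 64 over the YAML defaults
    config.Print()   # dumps the merged configuration
    print(config.batch_size, config.max_seq_len)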
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT fine-tuning in Paddle Dygraph Mode."""
import paddle.fluid as fluid
from hapi.metrics import Accuracy
from hapi.configure import Config
from hapi.text.bert import BertEncoder
from paddle.fluid.dygraph import Linear, Layer
from hapi.model import set_device, Model, SoftmaxWithCrossEntropy, Input
import hapi.text.tokenizer.tokenization as tokenization
from hapi.text.bert import Optimizer, BertConfig, BertDataLoader, BertInputExample


class ClsModelLayer(Model):
    """
    classify model
    """

    def __init__(self,
                 args,
                 config,
                 num_labels,
                 return_pooled_out=True,
                 use_fp16=False):
        super(ClsModelLayer, self).__init__()
        self.config = config
        self.use_fp16 = use_fp16
        self.loss_scaling = args.loss_scaling
        self.bert_layer = BertEncoder(
            config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
        self.cls_fc = Linear(
            input_dim=self.config["hidden_size"],
            output_dim=num_labels,
            param_attr=fluid.ParamAttr(
                name="cls_out_w",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=fluid.ParamAttr(
                name="cls_out_b", initializer=fluid.initializer.Constant(0.)))

    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
        """
        forward
        """
        enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
                                                     sentence_ids, input_mask)
        # classify on the pooled [CLS] feature, with dropout for regularization
        cls_feats = fluid.layers.dropout(
            x=next_sent_feat,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train")
        pred = self.cls_fc(cls_feats)
        return pred


def main():
    config = Config(yaml_file="./bert.yaml")
    config.build()
    config.Print()

    device = set_device("gpu" if config.use_cuda else "cpu")
    fluid.enable_dygraph(device)

    bert_config = BertConfig(config.bert_config_path)
    bert_config.print_config()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=config.vocab_path, do_lower_case=config.do_lower_case)

    def mnli_line_processor(line_id, line):
        # skip the TSV header row
        if line_id == "0":
            return None
        uid = tokenization.convert_to_unicode(line[0])
        text_a = tokenization.convert_to_unicode(line[8])
        text_b = tokenization.convert_to_unicode(line[9])
        label = tokenization.convert_to_unicode(line[-1])
        if label not in ["contradiction", "entailment", "neutral"]:
            label = "contradiction"
        return BertInputExample(
            uid=uid, text_a=text_a, text_b=text_b, label=label)

    train_dataloader = BertDataLoader(
        "./data/glue_data/MNLI/train.tsv",
        tokenizer, ["contradiction", "entailment", "neutral"],
        max_seq_length=config.max_seq_len,
        batch_size=config.batch_size,
        line_processor=mnli_line_processor)

    test_dataloader = BertDataLoader(
        "./data/glue_data/MNLI/dev_matched.tsv",
        tokenizer, ["contradiction", "entailment", "neutral"],
        max_seq_length=config.max_seq_len,
        batch_size=config.batch_size,
        line_processor=mnli_line_processor,
        shuffle=False,
        phase="predict")

    trainer_count = fluid.dygraph.parallel.Env().nranks
    num_train_examples = len(train_dataloader.dataset)
    max_train_steps = config.epoch * num_train_examples // config.batch_size // trainer_count
    warmup_steps = int(max_train_steps * config.warmup_proportion)

    print("Trainer count: %d" % trainer_count)
    print("Num train examples: %d" % num_train_examples)
    print("Max train steps: %d" % max_train_steps)
    print("Num warmup steps: %d" % warmup_steps)

    inputs = [
        Input([None, None], 'int64', name='src_ids'),
        Input([None, None], 'int64', name='pos_ids'),
        Input([None, None], 'int64', name='sent_ids'),
        Input([None, None], 'float32', name='input_mask')
    ]
    labels = [Input([None, 1], 'int64', name='label')]

    cls_model = ClsModelLayer(
        config,
        bert_config,
        len(["contradiction", "entailment", "neutral"]),
        return_pooled_out=True)

    optimizer = Optimizer(
        warmup_steps=warmup_steps,
        num_train_steps=max_train_steps,
        learning_rate=config.learning_rate,
        model_cls=cls_model,
        weight_decay=config.weight_decay,
        scheduler=config.lr_scheduler,
        loss_scaling=config.loss_scaling,
        parameter_list=cls_model.parameters())

    cls_model.prepare(
        optimizer,
        SoftmaxWithCrossEntropy(),
        Accuracy(topk=(1, 2)),
        inputs,
        labels,
        device=device)

    cls_model.bert_layer.init_parameters(
        config.init_pretraining_params, verbose=config.verbose)

    # do train
    cls_model.fit(train_data=train_dataloader.dataloader,
                  epochs=config.epoch,
                  save_dir=config.checkpoints)

    # do eval
    cls_model.evaluate(
        eval_data=test_dataloader.dataloader, batch_size=config.batch_size)


if __name__ == '__main__':
    main()

#!/bin/bash
BERT_BASE_PATH="./data/pretrained_models/uncased_L-12_H-768_A-12/"
TASK_NAME='MNLI'
DATA_PATH="./data/glue_data/MNLI/"
CKPT_PATH="./data/saved_model/mnli_models"
export CUDA_VISIBLE_DEVICES=0
# start fine-tuning
python3.7 bert_classifier.py \
--use_cuda true \
--do_train true \
--do_test true \
--batch_size 64 \
--init_pretraining_params ${BERT_BASE_PATH}/dygraph_params/ \
--data_dir ${DATA_PATH} \
--vocab_path ${BERT_BASE_PATH}/vocab.txt \
--checkpoints ${CKPT_PATH} \
--save_steps 1000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--validation_steps 100 \
--epoch 3 \
--max_seq_len 128 \
--bert_config_path ${BERT_BASE_PATH}/bert_config.json \
--learning_rate 5e-5 \
--skip_steps 10 \
--shuffle true
The commit adds a second copy of this example in which bert.yaml and the fine-tuning script above are repeated verbatim; the only functional difference is that its training BertDataLoader caches the preprocessed examples in LevelDB:

    train_dataloader = BertDataLoader(
        "./data/glue_data/MNLI/train.tsv",
        tokenizer, ["contradiction", "entailment", "neutral"],
        max_seq_length=config.max_seq_len,
        batch_size=config.batch_size,
        line_processor=mnli_line_processor,
        mode="leveldb",
        phase="train")

A multi-GPU launch script follows:

#!/bin/bash
BERT_BASE_PATH="./data/pretrained_models/uncased_L-12_H-768_A-12/"
TASK_NAME='MNLI'
DATA_PATH="./data/glue_data/MNLI/"
CKPT_PATH="./data/saved_model/mnli_models"
# start fine-tuning
python3.7 -m paddle.distributed.launch --started_port 8899 --selected_gpus=0,1,2,3 bert_classifier.py \
--use_cuda true \
--do_train true \
--do_test true \
--batch_size 64 \
--init_pretraining_params ${BERT_BASE_PATH}/dygraph_params/ \
--data_dir ${DATA_PATH} \
--vocab_path ${BERT_BASE_PATH}/vocab.txt \
--checkpoints ${CKPT_PATH} \
--save_steps 1000 \
--weight_decay 0.01 \
--warmup_proportion 0.1 \
--validation_steps 100 \
--epoch 3 \
--max_seq_len 128 \
--bert_config_path ${BERT_BASE_PATH}/bert_config.json \
--learning_rate 5e-5 \
--skip_steps 10 \
--shuffle true
@@ -29,7 +29,6 @@ BMN Overview
 ├── train.py          # training code, trains the network
 ├── eval.py           # evaluation code, measures network performance
 ├── predict.py        # prediction code, predicts results for arbitrary input
-├── bmn_model.py      # network structure and loss function definition
 ├── bmn_metric.py     # accuracy metric definition
 ├── reader.py         # data reader, builds Dataset and Dataloader
 ├── bmn_utils.py      # model utility code
@@ -41,7 +40,7 @@ BMN Overview
 ## Data preparation
 
-BMN is trained on the dataset provided by ActivityNet1.3. We provide preprocessed video features; download [bmn_feat](https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz), extract it, and update the feature path feat_path in bmn.yaml accordingly. For the corresponding label file, download [label](https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json) and update the label file path anno_file in bmn.yaml.
+BMN is trained on the dataset provided by ActivityNet1.3. We provide preprocessed video features and the corresponding label file; download the feature data [bmn_feat](https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz) and the label data [label](https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json), then update the feature file path feat_path and the label file path anno_file in the configuration file bmn.yaml accordingly.
 
 ## Model training
@@ -52,22 +51,17 @@
 bash run.sh
 
-For single-GPU training, launch as follows:
+For single-GPU training, set batch_size in the configuration file bmn.yaml to 16, then launch as follows:
 
+export CUDA_VISIBLE_DEVICES=0
 python train.py
 
+Static-graph training is the default; to train in dygraph mode, just add the `-d` flag to the command, e.g.:
+
+python train.py -d
+
 - Running the code requires pandas to be installed first
 - Training starts from scratch with the command line or script above; no pretrained model is needed
-- For single-GPU training, set batch_size in the configuration file to 16
-
-**Training strategy:**
-
-* Adam optimizer with initial learning_rate=0.001
-* Weight decay coefficient of 1e-4
-* The learning rate is decayed once, by a factor of 0.1, when the iteration count reaches 4200
 
 ## Model evaluation
@@ -76,9 +70,9 @@
 python eval.py --weights=$PATH_TO_WEIGHTS
 
-- The `weights` argument on the command line selects the weights to evaluate; if unset, the default parameter file checkpoint/final.pdparams is used
+- The `weights` argument on the command line selects the weights to evaluate; if unspecified, the script downloads the released [model](https://paddlemodels.bj.bcebos.com/hapi/bmn.pdparams) and evaluates it
-- The program saves its intermediate results under output/EVAL/BMN_results, and the test results in the file evaluate_results/bmn_results_validation.json
+- The program saves its intermediate results under the folder given by `--output_path` (default output/EVAL/BMN_results), and the test results under the folder given by `--result_path` (default evaluate_results)
 - Note: loss may come out as nan during evaluation. Evaluation runs on single samples, and a sample may contain no instance with iou>0.6, hence nan; this does not affect the final evaluation result.
@@ -87,9 +81,9 @@
 - For details on using the ActivityNet dataset, see its [official website](http://activity-net.org)
 
-- Download the metric evaluation code from the [ActivityNet GitHub repository](https://github.com/activitynet/ActivityNet.git) and copy the Evaluation folder into the models/dygraph/bmn directory. (Note: the third-party evaluation code does not support python3, so python2 is recommended for evaluation; to use python3, add parentheses to print calls in the .py files under Evaluation.)
+- Download the metric evaluation code from the [ActivityNet GitHub repository](https://github.com/activitynet/ActivityNet.git) and copy the Evaluation folder into the hapi/examples/bmn directory. (Note: the third-party evaluation code does not support python3, so python2 is recommended for evaluation; to use python3, add parentheses to print calls in the .py files under Evaluation.)
-- Download [activity_net_1_3_new.json](https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json) and place it in the models/dygraph/bmn/Evaluation/data directory; compared with the original activity_net.v1-3.min.json, it filters out some invalid video entries.
+- Download [activity_net_1_3_new.json](https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json) and place it in the hapi/examples/bmn/Evaluation/data directory; compared with the original activity_net.v1-3.min.json, it filters out some invalid video entries.
 
 - Compute the accuracy metrics
@@ -100,7 +94,7 @@
 | AR@1 | AR@5 | AR@10 | AR@100 | AUC |
 | :---: | :---: | :---: | :---: | :---: |
-| 33.46 | 49.25 | 56.25 | 75.40 | 67.16% |
+| 33.10 | 49.18 | 56.54 | 75.12 | 67.16% |
 
 ## Model inference
@@ -110,9 +104,9 @@
 python predict.py --weights=$PATH_TO_WEIGHTS \
                   --filelist=$FILELIST
 
-- When launching from the python command line, `--filelist` specifies the list of files to run inference on (default ./infer.list), and `--weights` is the trained weight file; if unset, the default parameter file checkpoint/final.pdparams is used
+- When launching from the python command line, `--filelist` specifies the list of files to run inference on (default ./infer.list), and `--weights` is the trained weight file; if unspecified, the script downloads the released [model](https://paddlemodels.bj.bcebos.com/hapi/bmn.pdparams) and predicts with it
-- The program saves its intermediate results under output/INFER/BMN_results, and the test results in the file predict_results/bmn_results_test.json
+- The program saves its intermediate results under the folder given by `--output_path` (default output/INFER/BMN_results), and the test results under the folder given by `--result_path` (default predict_results)
 
 ## References
......
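
Before training, it is worth checking that the two paths from the data preparation step resolve. A hypothetical sanity check, assuming the parse_config helper from this example's config_utils and that feat_path and anno_file live under the MODEL section of bmn.yaml:

    # Hypothetical check; the MODEL-section key names (feat_path, anno_file)
    # are assumptions based on the data-preparation notes above.
    import os
    from config_utils import parse_config

    config = parse_config('bmn.yaml')
    for path in [config.MODEL.feat_path, config.MODEL.anno_file]:
        assert os.path.exists(path), 'missing data, fix bmn.yaml: ' + path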
@@ -12,11 +12,10 @@ MODEL:
 TRAIN:
     subset: "train"
     epoch: 9
     batch_size: 4
     num_workers: 4
     use_shuffle: True
     device: "gpu"
-    num_gpus: 4
     learning_rate: 0.001
     learning_rate_decay: 0.1
     lr_decay_iter: 4200
@@ -29,10 +28,6 @@ TEST:
     subset: "validation"
     batch_size: 1
     num_workers: 1
-    use_buffer: False
-    snms_alpha: 0.001
-    snms_t1: 0.5
-    snms_t2: 0.9
     output_path: "output/EVAL/BMN_results"
     result_path: "evaluate_results"
@@ -40,10 +35,6 @@ INFER:
     subset: "test"
     batch_size: 1
     num_workers: 1
-    use_buffer: False
-    snms_alpha: 0.4
-    snms_t1: 0.5
-    snms_t2: 0.9
     filelist: './infer.list'
     output_path: "output/INFER/BMN_results"
     result_path: "predict_results"
......
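
The TRAIN keys above (learning_rate, learning_rate_decay, lr_decay_iter) drive the optimizer(config, parameter_list=...) helper that train.py calls below. A minimal sketch of such a helper, not the exact implementation: it pairs Adam with a one-step piecewise decay, and the 1e-4 L2 weight decay is an assumption carried over from the README's former training-strategy notes:

    import paddle.fluid as fluid

    def make_optimizer(cfg, parameter_list):
        # decay the learning rate once, by learning_rate_decay, at lr_decay_iter
        boundaries = [cfg.TRAIN.lr_decay_iter]                     # 4200 above
        values = [cfg.TRAIN.learning_rate,                         # 0.001 above
                  cfg.TRAIN.learning_rate * cfg.TRAIN.learning_rate_decay]
        return fluid.optimizer.Adam(
            learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, begin=0),
            parameter_list=parameter_list,
            regularization=fluid.regularizer.L2Decay(1e-4))        # assumed value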
@@ -20,7 +20,7 @@ import json
 sys.path.append('../')
 
-from metrics import Metric
+from hapi.metrics import Metric
 from bmn_utils import boundary_choose, bmn_post_processing
 
@@ -36,13 +36,26 @@ class BmnMetric(Metric):
         #get video_dict and video_list
         if self.mode == 'test':
             self.get_test_dataset_dict()
+            if not os.path.isdir(self.cfg.TEST.output_path):
+                os.makedirs(self.cfg.TEST.output_path)
+            if not os.path.isdir(self.cfg.TEST.result_path):
+                os.makedirs(self.cfg.TEST.result_path)
         elif self.mode == 'infer':
             self.get_infer_dataset_dict()
+            if not os.path.isdir(self.cfg.INFER.output_path):
+                os.makedirs(self.cfg.INFER.output_path)
+            if not os.path.isdir(self.cfg.INFER.result_path):
+                os.makedirs(self.cfg.INFER.result_path)
 
-    def add_metric_op(self, preds, label):
-        pred_bm, pred_start, pred_en = preds
-        video_index = label[-1]
-        return [pred_bm, pred_start, pred_en, video_index]  #return list
+    def add_metric_op(self, *args):
+        if self.mode == 'test':
+            # only extract pred_bm, pred_start, pred_en from outputs
+            # and video_index from label here
+            pred_bm, pred_start, pred_en, _, _, _, video_index = args
+        else:
+            # in infer mode, labels only contains video_index
+            pred_bm, pred_start, pred_en, video_index = args
+        return pred_bm, pred_start, pred_en, video_index
 
     def update(self, pred_bm, pred_start, pred_end, fid):
         # generate proposals
......
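
The variadic add_metric_op above receives the model outputs followed by the labels as one flat argument list; in test mode that is the three predictions, the three BMN training targets, then the video index. A toy illustration of the unpacking (plain Python, not framework code):

    # Illustrative only: mirrors the test-mode unpacking in add_metric_op.
    outputs = ['pred_bm', 'pred_start', 'pred_en']
    labels = ['gt_iou_map', 'gt_start', 'gt_end', 'video_index']
    pred_bm, pred_start, pred_en, _, _, _, video_index = outputs + labels
    print(pred_bm, video_index)  # pred_bm video_index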
@@ -162,56 +162,3 @@ def bmn_post_processing(video_dict, subset, output_path, result_path):
     outfile.close()
-
-
-def _get_interp1d_bin_mask(seg_xmin, seg_xmax, tscale, num_sample,
-                           num_sample_perbin):
-    """ generate sample mask for a boundary-matching pair """
-    plen = float(seg_xmax - seg_xmin)
-    plen_sample = plen / (num_sample * num_sample_perbin - 1.0)
-    total_samples = [
-        seg_xmin + plen_sample * ii
-        for ii in range(num_sample * num_sample_perbin)
-    ]
-    p_mask = []
-    for idx in range(num_sample):
-        bin_samples = total_samples[idx * num_sample_perbin:(idx + 1) *
-                                    num_sample_perbin]
-        bin_vector = np.zeros([tscale])
-        for sample in bin_samples:
-            sample_upper = math.ceil(sample)
-            sample_decimal, sample_down = math.modf(sample)
-            if int(sample_down) <= (tscale - 1) and int(sample_down) >= 0:
-                bin_vector[int(sample_down)] += 1 - sample_decimal
-            if int(sample_upper) <= (tscale - 1) and int(sample_upper) >= 0:
-                bin_vector[int(sample_upper)] += sample_decimal
-        bin_vector = 1.0 / num_sample_perbin * bin_vector
-        p_mask.append(bin_vector)
-    p_mask = np.stack(p_mask, axis=1)
-    return p_mask
-
-
-def get_interp1d_mask(tscale, dscale, prop_boundary_ratio, num_sample,
-                      num_sample_perbin):
-    """ generate sample mask for each point in Boundary-Matching Map """
-    mask_mat = []
-    for start_index in range(tscale):
-        mask_mat_vector = []
-        for duration_index in range(dscale):
-            if start_index + duration_index < tscale:
-                p_xmin = start_index
-                p_xmax = start_index + duration_index
-                center_len = float(p_xmax - p_xmin) + 1
-                sample_xmin = p_xmin - center_len * prop_boundary_ratio
-                sample_xmax = p_xmax + center_len * prop_boundary_ratio
-                p_mask = _get_interp1d_bin_mask(sample_xmin, sample_xmax,
-                                                tscale, num_sample,
-                                                num_sample_perbin)
-            else:
-                p_mask = np.zeros([tscale, num_sample])
-            mask_mat_vector.append(p_mask)
-        mask_mat_vector = np.stack(mask_mat_vector, axis=2)
-        mask_mat.append(mask_mat_vector)
-    mask_mat = np.stack(mask_mat, axis=3)
-    mask_mat = mask_mat.astype(np.float32)
-    sample_mask = np.reshape(mask_mat, [tscale, -1])
-    return sample_mask
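
These two functions are not gone: they reappear verbatim in modeling.py further down. A quick sketch of the mask they build, using the default sizes documented there (tscale=dscale=100, num_sample=32, num_sample_perbin=3):

    # Shape check for the boundary-matching sample mask (now in modeling.py).
    from modeling import get_interp1d_mask

    mask = get_interp1d_mask(tscale=100, dscale=100, prop_boundary_ratio=0.5,
                             num_sample=32, num_sample_perbin=3)
    print(mask.shape)  # (100, 320000), i.e. (tscale, num_sample * dscale * tscale)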
@@ -18,11 +18,10 @@ import sys
 import logging
 import paddle.fluid as fluid
 
-sys.path.append('../')
-
-from model import set_device, Input
+from hapi.model import set_device, Input
+from modeling import bmn, BmnLoss
 from bmn_metric import BmnMetric
-from bmn_model import BMN, BmnLoss
 from reader import BmnDataset
 from config_utils import *
@@ -39,7 +38,6 @@ def parse_args():
     parser.add_argument(
         "-d",
         "--dynamic",
-        default=True,
         action='store_true',
         help="enable dygraph mode, only support dynamic mode at present time")
     parser.add_argument(
@@ -55,9 +53,20 @@ def parse_args():
     parser.add_argument(
         '--weights',
         type=str,
-        default="checkpoint/final",
+        default=None,
         help='weight path, None to automatically download weights provided by Paddle.'
     )
+    parser.add_argument(
+        '--output_path',
+        type=str,
+        default="output/EVAL/BMN_results",
+        help='output dir path, default to use output/EVAL/BMN_results')
+    parser.add_argument(
+        '--result_path',
+        type=str,
+        default="evaluate_results/",
+        help='output dir path after post processing, default to use ./evaluate_results/'
+    )
     parser.add_argument(
         '--log_interval',
         type=int,
@@ -69,17 +78,21 @@ def parse_args():
 # Performance Evaluation
 def test_bmn(args):
+    # only support dynamic mode at present time
     device = set_device(args.device)
     fluid.enable_dygraph(device) if args.dynamic else None
 
+    #config setting
     config = parse_config(args.config_file)
     eval_cfg = merge_configs(config, 'test', vars(args))
-    if not os.path.isdir(config.TEST.output_path):
-        os.makedirs(config.TEST.output_path)
-    if not os.path.isdir(config.TEST.result_path):
-        os.makedirs(config.TEST.result_path)
+
+    feat_dim = config.MODEL.feat_dim
+    tscale = config.MODEL.tscale
+    dscale = config.MODEL.dscale
+    prop_boundary_ratio = config.MODEL.prop_boundary_ratio
+    num_sample = config.MODEL.num_sample
+    num_sample_perbin = config.MODEL.num_sample_perbin
 
+    #input and video index
     inputs = [
         Input(
             [None, config.MODEL.feat_dim, config.MODEL.tscale],
@@ -99,9 +112,14 @@ def test_bmn(args):
     eval_dataset = BmnDataset(eval_cfg, 'test')
 
     #model
-    model = BMN(config, args.dynamic)
+    model = bmn(tscale,
+                dscale,
+                prop_boundary_ratio,
+                num_sample,
+                num_sample_perbin,
+                pretrained=args.weights is None)
     model.prepare(
-        loss_function=BmnLoss(config),
+        loss_function=BmnLoss(tscale, dscale),
         metrics=BmnMetric(
             config, mode='test'),
         inputs=inputs,
@@ -109,11 +127,11 @@ def test_bmn(args):
         device=device)
 
     #load checkpoint
-    if args.weights:
+    if args.weights is not None:
         assert os.path.exists(args.weights + '.pdparams'), \
             "Given weight dir {} not exist.".format(args.weights)
         logger.info('load test weights from {}'.format(args.weights))
         model.load(args.weights)
 
     model.evaluate(
         eval_data=eval_dataset,
......
@@ -17,11 +17,73 @@ from paddle.fluid import ParamAttr
 import numpy as np
 import math
 
-from bmn_utils import get_interp1d_mask
-from model import Model, Loss
+from hapi.model import Model, Loss
+from hapi.download import get_weights_path
+
+__all__ = ["BMN", "BmnLoss", "bmn"]
 
 DATATYPE = 'float32'
 
+pretrain_infos = {
+    'bmn': ('https://paddlemodels.bj.bcebos.com/hapi/bmn.pdparams',
+            'aa84e3386e1fbd117fb96fa572feeb94')
+}
+
+
+def _get_interp1d_bin_mask(seg_xmin, seg_xmax, tscale, num_sample,
+                           num_sample_perbin):
+    """ generate sample mask for a boundary-matching pair """
+    plen = float(seg_xmax - seg_xmin)
+    plen_sample = plen / (num_sample * num_sample_perbin - 1.0)
+    total_samples = [
+        seg_xmin + plen_sample * ii
+        for ii in range(num_sample * num_sample_perbin)
+    ]
+    p_mask = []
+    for idx in range(num_sample):
+        bin_samples = total_samples[idx * num_sample_perbin:(idx + 1) *
+                                    num_sample_perbin]
+        bin_vector = np.zeros([tscale])
+        for sample in bin_samples:
+            sample_upper = math.ceil(sample)
+            sample_decimal, sample_down = math.modf(sample)
+            if int(sample_down) <= (tscale - 1) and int(sample_down) >= 0:
+                bin_vector[int(sample_down)] += 1 - sample_decimal
+            if int(sample_upper) <= (tscale - 1) and int(sample_upper) >= 0:
+                bin_vector[int(sample_upper)] += sample_decimal
+        bin_vector = 1.0 / num_sample_perbin * bin_vector
+        p_mask.append(bin_vector)
+    p_mask = np.stack(p_mask, axis=1)
+    return p_mask
+
+
+def get_interp1d_mask(tscale, dscale, prop_boundary_ratio, num_sample,
+                      num_sample_perbin):
+    """ generate sample mask for each point in Boundary-Matching Map """
+    mask_mat = []
+    for start_index in range(tscale):
+        mask_mat_vector = []
+        for duration_index in range(dscale):
+            if start_index + duration_index < tscale:
+                p_xmin = start_index
+                p_xmax = start_index + duration_index
+                center_len = float(p_xmax - p_xmin) + 1
+                sample_xmin = p_xmin - center_len * prop_boundary_ratio
+                sample_xmax = p_xmax + center_len * prop_boundary_ratio
+                p_mask = _get_interp1d_bin_mask(sample_xmin, sample_xmax,
+                                                tscale, num_sample,
+                                                num_sample_perbin)
+            else:
+                p_mask = np.zeros([tscale, num_sample])
+            mask_mat_vector.append(p_mask)
+        mask_mat_vector = np.stack(mask_mat_vector, axis=2)
+        mask_mat.append(mask_mat_vector)
+    mask_mat = np.stack(mask_mat, axis=3)
+    mask_mat = mask_mat.astype(np.float32)
+    sample_mask = np.reshape(mask_mat, [tscale, -1])
+    return sample_mask
+
 
 # Net
 class Conv1D(fluid.dygraph.Layer):
@@ -64,16 +126,27 @@ class Conv1D(fluid.dygraph.Layer):
 
 class BMN(Model):
-    def __init__(self, cfg, is_dygraph=True):
+    """BMN model from
+    `"BMN: Boundary-Matching Network for Temporal Action Proposal Generation" <https://arxiv.org/abs/1907.09702>`_
+
+    Args:
+        tscale (int): sequence length, default 100.
+        dscale (int): max duration length, default 100.
+        prop_boundary_ratio (float): ratio of expanded temporal region in proposal boundary, default 0.5.
+        num_sample (int): number of samples between the starting and ending boundary of each proposal, default 32.
+        num_sample_perbin (int): number of selected points in each sample, default 3.
+    """
+
+    def __init__(self, tscale, dscale, prop_boundary_ratio, num_sample,
+                 num_sample_perbin):
         super(BMN, self).__init__()
 
         #init config
-        self.tscale = cfg.MODEL.tscale
-        self.dscale = cfg.MODEL.dscale
-        self.prop_boundary_ratio = cfg.MODEL.prop_boundary_ratio
-        self.num_sample = cfg.MODEL.num_sample
-        self.num_sample_perbin = cfg.MODEL.num_sample_perbin
-        self.is_dygraph = is_dygraph
+        self.tscale = tscale
+        self.dscale = dscale
+        self.prop_boundary_ratio = prop_boundary_ratio
+        self.num_sample = num_sample
+        self.num_sample_perbin = num_sample_perbin
 
         self.hidden_dim_1d = 256
         self.hidden_dim_2d = 128
@@ -124,23 +197,17 @@ class BMN(Model):
             padding=1,
             act="relu")
 
-        # init to speed up
+        # get sample mask
         sample_mask_array = get_interp1d_mask(
             self.tscale, self.dscale, self.prop_boundary_ratio,
             self.num_sample, self.num_sample_perbin)
-        if self.is_dygraph:
-            self.sample_mask = fluid.dygraph.base.to_variable(
-                sample_mask_array)
-        else:  # static
-            self.sample_mask = fluid.layers.create_parameter(
-                shape=[
-                    self.tscale, self.num_sample * self.dscale * self.tscale
-                ],
-                dtype=DATATYPE,
-                attr=fluid.ParamAttr(
-                    name="sample_mask", trainable=False),
-                default_initializer=fluid.initializer.NumpyArrayInitializer(
-                    sample_mask_array))
+        self.sample_mask = fluid.layers.create_parameter(
+            shape=[self.tscale, self.num_sample * self.dscale * self.tscale],
+            dtype=DATATYPE,
+            attr=fluid.ParamAttr(
+                name="sample_mask", trainable=False),
+            default_initializer=fluid.initializer.NumpyArrayInitializer(
+                sample_mask_array))
 
         self.sample_mask.stop_gradient = True
@@ -221,21 +288,30 @@ class BMN(Model):
 
 class BmnLoss(Loss):
-    def __init__(self, cfg):
+    """Loss for BMN model
+
+    Args:
+        tscale (int): sequence length, default 100.
+        dscale (int): max duration length, default 100.
+    """
+
+    def __init__(self, tscale, dscale):
         super(BmnLoss, self).__init__()
-        self.cfg = cfg
+        self.tscale = tscale
+        self.dscale = dscale
 
     def _get_mask(self):
-        dscale = self.cfg.MODEL.dscale
-        tscale = self.cfg.MODEL.tscale
         bm_mask = []
-        for idx in range(dscale):
-            mask_vector = [1 for i in range(tscale - idx)
+        for idx in range(self.dscale):
+            mask_vector = [1 for i in range(self.tscale - idx)
                            ] + [0 for i in range(idx)]
             bm_mask.append(mask_vector)
         bm_mask = np.array(bm_mask, dtype=np.float32)
         self_bm_mask = fluid.layers.create_global_var(
-            shape=[dscale, tscale], value=0, dtype=DATATYPE, persistable=True)
+            shape=[self.dscale, self.tscale],
+            value=0,
+            dtype=DATATYPE,
+            persistable=True)
         fluid.layers.assign(bm_mask, self_bm_mask)
         self_bm_mask.stop_gradient = True
         return self_bm_mask
@@ -362,3 +438,29 @@ class BmnLoss(Loss):
         loss = tem_loss + 10 * pem_reg_loss + pem_cls_loss
         return loss
+
+
+def bmn(tscale,
+        dscale,
+        prop_boundary_ratio,
+        num_sample,
+        num_sample_perbin,
+        pretrained=True):
+    """BMN model
+
+    Args:
+        tscale (int): sequence length, default 100.
+        dscale (int): max duration length, default 100.
+        prop_boundary_ratio (float): ratio of expanded temporal region in proposal boundary, default 0.5.
+        num_sample (int): number of samples between the starting and ending boundary of each proposal, default 32.
+        num_sample_perbin (int): number of selected points in each sample, default 3.
+        pretrained (bool): If True, returns a model with pre-trained weights, default True.
+    """
+    model = BMN(tscale, dscale, prop_boundary_ratio, num_sample,
+                num_sample_perbin)
+    if pretrained:
+        weight_path = get_weights_path(*(pretrain_infos['bmn']))
+        assert weight_path.endswith('.pdparams'), \
+            "suffix of weight must be .pdparams"
+        model.load(weight_path)
+    return model
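
A short usage sketch of the new bmn factory, mirroring how eval.py and predict.py call it in this commit; the argument values are the defaults listed in the docstrings above:

    import paddle.fluid as fluid
    from hapi.model import set_device
    from modeling import bmn

    # dygraph mode; released weights are downloaded via pretrain_infos
    fluid.enable_dygraph(set_device('gpu'))
    model = bmn(tscale=100, dscale=100, prop_boundary_ratio=0.5,
                num_sample=32, num_sample_perbin=3, pretrained=True)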
@@ -18,11 +18,10 @@ import os
 import logging
 import paddle.fluid as fluid
 
-sys.path.append('../')
-
-from model import set_device, Input
+from hapi.model import set_device, Input
+from modeling import bmn, BmnLoss
 from bmn_metric import BmnMetric
-from bmn_model import BMN, BmnLoss
 from reader import BmnDataset
 from config_utils import *
@@ -39,7 +38,6 @@ def parse_args():
     parser.add_argument(
         "-d",
         "--dynamic",
-        default=True,
         action='store_true',
         help="enable dygraph mode, only support dynamic mode at present time")
     parser.add_argument(
@@ -52,14 +50,25 @@ def parse_args():
     parser.add_argument(
         '--weights',
         type=str,
-        default="checkpoint/final",
+        default=None,
         help='weight path, None to automatically download weights provided by Paddle.'
     )
     parser.add_argument(
-        '--save_dir',
+        '--filelist',
+        type=str,
+        default="infer.list",
+        help='infer file list, default to use ./infer.list')
+    parser.add_argument(
+        '--output_path',
+        type=str,
+        default="output/INFER/BMN_results",
+        help='output dir path, default to use output/INFER/BMN_results')
+    parser.add_argument(
+        '--result_path',
         type=str,
         default="predict_results/",
-        help='output dir path, default to use ./predict_results/')
+        help='output dir path after post processing, default to use ./predict_results/'
+    )
     parser.add_argument(
         '--log_interval',
         type=int,
@@ -71,18 +80,21 @@ def parse_args():
 # Prediction
 def infer_bmn(args):
+    # only support dynamic mode at present time
     device = set_device(args.device)
     fluid.enable_dygraph(device) if args.dynamic else None
 
+    #config setting
     config = parse_config(args.config_file)
     infer_cfg = merge_configs(config, 'infer', vars(args))
 
-    if not os.path.isdir(config.INFER.output_path):
-        os.makedirs(config.INFER.output_path)
-    if not os.path.isdir(config.INFER.result_path):
-        os.makedirs(config.INFER.result_path)
+    feat_dim = config.MODEL.feat_dim
+    tscale = config.MODEL.tscale
+    dscale = config.MODEL.dscale
+    prop_boundary_ratio = config.MODEL.prop_boundary_ratio
+    num_sample = config.MODEL.num_sample
+    num_sample_perbin = config.MODEL.num_sample_perbin
 
+    #input and video index
     inputs = [
         Input(
             [None, config.MODEL.feat_dim, config.MODEL.tscale],
@@ -94,7 +106,13 @@ def infer_bmn(args):
     #data
     infer_dataset = BmnDataset(infer_cfg, 'infer')
 
-    model = BMN(config, args.dynamic)
+    #model
+    model = bmn(tscale,
+                dscale,
+                prop_boundary_ratio,
+                num_sample,
+                num_sample_perbin,
+                pretrained=args.weights is None)
     model.prepare(
         metrics=BmnMetric(
             config, mode='infer'),
@@ -103,12 +121,12 @@ def infer_bmn(args):
         device=device)
 
     # load checkpoint
-    if args.weights:
+    if args.weights is not None:
         assert os.path.exists(
             args.weights +
             ".pdparams"), "Given weight dir {} not exist.".format(args.weights)
         logger.info('load test weights from {}'.format(args.weights))
         model.load(args.weights)
 
     # here use model.eval instead of model.test, as post process is required in our case
     model.evaluate(
......
@@ -21,8 +21,8 @@ import sys
 sys.path.append('../')
 
-from distributed import DistributedBatchSampler
-from paddle.fluid.io import Dataset, DataLoader
+from hapi.distributed import DistributedBatchSampler
+from paddle.io import Dataset, DataLoader
 
 logger = logging.getLogger(__name__)
......
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.launch train.py python -m paddle.distributed.launch train.py
@@ -18,12 +18,11 @@ import logging
 import sys
 import os
 
-sys.path.append('../')
-
-from model import set_device, Input
-from bmn_model import BMN, BmnLoss
+from hapi.model import set_device, Input
 from reader import BmnDataset
 from config_utils import *
+from modeling import bmn, BmnLoss
 
 DATATYPE = 'float32'
@@ -36,11 +35,7 @@ logger = logging.getLogger(__name__)
 def parse_args():
     parser = argparse.ArgumentParser("Paddle high level api of BMN.")
     parser.add_argument(
-        "-d",
-        "--dynamic",
-        default=True,
-        action='store_true',
-        help="enable dygraph mode")
+        "-d", "--dynamic", action='store_true', help="enable dygraph mode")
     parser.add_argument(
         '--config_file',
         type=str,
@@ -50,7 +45,7 @@ def parse_args():
         '--batch_size',
         type=int,
         default=None,
-        help='training batch size. None to use config file setting.')
+        help='training batch size. None for read from config file.')
     parser.add_argument(
         '--learning_rate',
         type=float,
@@ -70,8 +65,8 @@ def parse_args():
     parser.add_argument(
         '--epoch',
         type=int,
-        default=9,
-        help='epoch number, 0 for read from config file')
+        default=None,
+        help='epoch number, None for read from config file')
     parser.add_argument(
         '--valid_interval',
         type=int,
@@ -115,22 +110,23 @@ def train_bmn(args):
     if not os.path.isdir(args.save_dir):
         os.makedirs(args.save_dir)
 
+    #config setting
     config = parse_config(args.config_file)
     train_cfg = merge_configs(config, 'train', vars(args))
     val_cfg = merge_configs(config, 'valid', vars(args))
 
-    inputs = [
-        Input(
-            [None, config.MODEL.feat_dim, config.MODEL.tscale],
-            'float32',
-            name='feat_input')
-    ]
-    gt_iou_map = Input(
-        [None, config.MODEL.dscale, config.MODEL.tscale],
-        'float32',
-        name='gt_iou_map')
-    gt_start = Input([None, config.MODEL.tscale], 'float32', name='gt_start')
-    gt_end = Input([None, config.MODEL.tscale], 'float32', name='gt_end')
+    feat_dim = config.MODEL.feat_dim
+    tscale = config.MODEL.tscale
+    dscale = config.MODEL.dscale
+    prop_boundary_ratio = config.MODEL.prop_boundary_ratio
+    num_sample = config.MODEL.num_sample
+    num_sample_perbin = config.MODEL.num_sample_perbin
+
+    # input and label list
+    inputs = [Input([None, feat_dim, tscale], 'float32', name='feat_input')]
+    gt_iou_map = Input([None, dscale, tscale], 'float32', name='gt_iou_map')
+    gt_start = Input([None, tscale], 'float32', name='gt_start')
+    gt_end = Input([None, tscale], 'float32', name='gt_end')
     labels = [gt_iou_map, gt_start, gt_end]
 
     # data
@@ -138,11 +134,16 @@ def train_bmn(args):
     val_dataset = BmnDataset(val_cfg, 'valid')
 
     # model
-    model = BMN(config, args.dynamic)
+    model = bmn(tscale,
+                dscale,
+                prop_boundary_ratio,
+                num_sample,
+                num_sample_perbin,
+                pretrained=False)
     optim = optimizer(config, parameter_list=model.parameters())
     model.prepare(
         optimizer=optim,
-        loss_function=BmnLoss(config),
+        loss_function=BmnLoss(tscale, dscale),
         inputs=inputs,
         labels=labels,
         device=device)
@@ -150,11 +151,10 @@ def train_bmn(args):
     # if resume weights is given, load resume weights directly
     if args.resume is not None:
         model.load(args.resume)
 
     model.fit(train_data=train_dataset,
               eval_data=val_dataset,
               batch_size=train_cfg.TRAIN.batch_size,
-              epochs=args.epoch,
+              epochs=train_cfg.TRAIN.epoch,
               eval_freq=args.valid_interval,
               log_freq=args.log_interval,
               save_dir=args.save_dir,
......
@@ -80,12 +80,19 @@ data/cityscapes/testA/412_A.jpg
 ### Training
 
-Train on a single GPU:
+Train on a single GPU with the static graph:
 ```
-env CUDA_VISIBLE_DEVICES=0 python train.py
+env CUDA_VISIBLE_DEVICES=0 python train.py --checkpoint_path=checkpoint_static
 ```
+
+Train on a single GPU in dygraph mode:
+```
+env CUDA_VISIBLE_DEVICES=0 python train.py --dynamic --checkpoint_path=checkpoint_dynamic
+```
 
 Run `python train.py --help` for more usage information and detailed argument descriptions.
 
 Figure 1 shows the training loss over 152 epochs of training: the horizontal axis is the training epoch and the vertical axis is the loss on the training set, where 'g_loss', 'da_loss' and 'db_loss' are the training losses of the generator, discriminator A and discriminator B, respectively.
......
@@ -18,9 +18,10 @@ from __future__ import print_function
 import numpy as np
 
-from layers import ConvBN, DeConvBN
 import paddle.fluid as fluid
-from model import Model, Loss
+from hapi.model import Model, Loss
+
+from layers import ConvBN, DeConvBN
 
 class ResnetBlock(fluid.dygraph.Layer):
......
@@ -20,6 +20,8 @@ import random
 import numpy as np
 from PIL import Image, ImageOps
 
+import paddle
+
 DATASET = "cityscapes"
 A_LIST_FILE = "./data/" + DATASET + "/trainA.txt"
 B_LIST_FILE = "./data/" + DATASET + "/trainB.txt"
@@ -27,10 +29,8 @@ A_TEST_LIST_FILE = "./data/" + DATASET + "/testA.txt"
 B_TEST_LIST_FILE = "./data/" + DATASET + "/testB.txt"
 IMAGES_ROOT = "./data/" + DATASET + "/"
 
-import paddle.fluid as fluid
 
-class Cityscapes(fluid.io.Dataset):
+class Cityscapes(paddle.io.Dataset):
     def __init__(self, root_path, file_path, mode='train', return_name=False):
         self.root_path = root_path
         self.file_path = file_path
......
@@ -25,9 +25,9 @@ from PIL import Image
 from scipy.misc import imsave
 import paddle.fluid as fluid
 
-from check import check_gpu, check_version
-from model import Model, Input, set_device
+from hapi.model import Model, Input, set_device
+from check import check_gpu, check_version
 from cyclegan import Generator, GeneratorCombine
@@ -43,7 +43,7 @@ def main():
     im_shape = [-1, 3, 256, 256]
     input_A = Input(im_shape, 'float32', 'input_A')
     input_B = Input(im_shape, 'float32', 'input_B')
-    g.prepare(inputs=[input_A, input_B])
+    g.prepare(inputs=[input_A, input_B], device=FLAGS.device)
     g.load(FLAGS.init_model, skip_mismatch=True, reset_optimizer=True)
 
     out_path = FLAGS.output + "/single"
@@ -59,10 +59,10 @@ def main():
         data = image.transpose([2, 0, 1])[np.newaxis, :]
 
         if FLAGS.input_style == "A":
-            _, fake, _, _ = g.test([data, data])
+            _, fake, _, _ = g.test_batch([data, data])
 
         if FLAGS.input_style == "B":
-            fake, _, _, _ = g.test([data, data])
+            fake, _, _, _ = g.test_batch([data, data])
 
         fake = np.squeeze(fake[0]).transpose([1, 2, 0])
@@ -74,7 +74,7 @@ def main():
 if __name__ == "__main__":
     parser = argparse.ArgumentParser("CycleGAN inference")
     parser.add_argument(
-        "-d", "--dynamic", action='store_false', help="Enable dygraph mode")
+        "-d", "--dynamic", action='store_true', help="Enable dygraph mode")
     parser.add_argument(
         "-p",
         "--device",
......
@@ -22,9 +22,9 @@ import numpy as np
 from scipy.misc import imsave
 import paddle.fluid as fluid
 
-from check import check_gpu, check_version
-from model import Model, Input, set_device
+from hapi.model import Model, Input, set_device
+from check import check_gpu, check_version
 from cyclegan import Generator, GeneratorCombine
 import data as data
@@ -41,7 +41,7 @@ def main():
     im_shape = [-1, 3, 256, 256]
     input_A = Input(im_shape, 'float32', 'input_A')
     input_B = Input(im_shape, 'float32', 'input_B')
-    g.prepare(inputs=[input_A, input_B])
+    g.prepare(inputs=[input_A, input_B], device=FLAGS.device)
     g.load(FLAGS.init_model, skip_mismatch=True, reset_optimizer=True)
 
     if not os.path.exists(FLAGS.output):
@@ -56,7 +56,7 @@ def main():
         data_A = np.array(data_A).astype("float32")
         data_B = np.array(data_B).astype("float32")
 
-        fake_A, fake_B, cyc_A, cyc_B = g.test([data_A, data_B])
+        fake_A, fake_B, cyc_A, cyc_B = g.test_batch([data_A, data_B])
 
         datas = [fake_A, fake_B, cyc_A, cyc_B, data_A, data_B]
         odatas = []
@@ -75,7 +75,7 @@ def main():
 if __name__ == "__main__":
     parser = argparse.ArgumentParser("CycleGAN test")
     parser.add_argument(
-        "-d", "--dynamic", action='store_false', help="Enable dygraph mode")
+        "-d", "--dynamic", action='store_true', help="Enable dygraph mode")
     parser.add_argument(
         "-p",
         "--device",
......
@@ -24,12 +24,11 @@ import time
 import paddle
 import paddle.fluid as fluid
 
-from check import check_gpu, check_version
-from model import Model, Input, set_device
+from hapi.model import Model, Input, set_device
 
-import data as data
+from check import check_gpu, check_version
 from cyclegan import Generator, Discriminator, GeneratorCombine, GLoss, DLoss
+import data as data
 
 step_per_epoch = 2974
@@ -76,23 +75,26 @@ def main():
     fake_A = Input(im_shape, 'float32', 'fake_A')
     fake_B = Input(im_shape, 'float32', 'fake_B')
 
-    g_AB.prepare(inputs=[input_A])
-    g_BA.prepare(inputs=[input_B])
-
-    g.prepare(g_optimizer, GLoss(), inputs=[input_A, input_B])
-    d_A.prepare(da_optimizer, DLoss(), inputs=[input_B, fake_B])
-    d_B.prepare(db_optimizer, DLoss(), inputs=[input_A, fake_A])
+    g_AB.prepare(inputs=[input_A], device=FLAGS.device)
+    g_BA.prepare(inputs=[input_B], device=FLAGS.device)
+
+    g.prepare(g_optimizer, GLoss(), inputs=[input_A, input_B],
+              device=FLAGS.device)
+    d_A.prepare(da_optimizer, DLoss(), inputs=[input_B, fake_B],
+                device=FLAGS.device)
+    d_B.prepare(db_optimizer, DLoss(), inputs=[input_A, fake_A],
+                device=FLAGS.device)
 
     if FLAGS.resume:
         g.load(FLAGS.resume)
 
-    loader_A = fluid.io.DataLoader(
+    loader_A = paddle.io.DataLoader(
         data.DataA(),
         places=place,
         shuffle=True,
         return_list=True,
         batch_size=FLAGS.batch_size)
-    loader_B = fluid.io.DataLoader(
+    loader_B = paddle.io.DataLoader(
         data.DataB(),
         places=place,
         shuffle=True,
@@ -108,14 +110,14 @@ def main():
             data_B = data_B[0][0] if not FLAGS.dynamic else data_B[0]
             start = time.time()
 
-            fake_B = g_AB.test(data_A)[0]
-            fake_A = g_BA.test(data_B)[0]
+            fake_B = g_AB.test_batch(data_A)[0]
+            fake_A = g_BA.test_batch(data_B)[0]
 
-            g_loss = g.train([data_A, data_B])[0]
+            g_loss = g.train_batch([data_A, data_B])[0]
 
             fake_pb = B_pool.get(fake_B)
-            da_loss = d_A.train([data_B, fake_pb])[0]
+            da_loss = d_A.train_batch([data_B, fake_pb])[0]
 
             fake_pa = A_pool.get(fake_A)
-            db_loss = d_B.train([data_A, fake_pa])[0]
+            db_loss = d_B.train_batch([data_A, fake_pa])[0]
 
             t = time.time() - start
             if i % 20 == 0:
@@ -128,7 +130,7 @@ def main():
 if __name__ == "__main__":
     parser = argparse.ArgumentParser("CycleGAN Training on Cityscapes")
     parser.add_argument(
-        "-d", "--dynamic", action='store_false', help="Enable dygraph mode")
+        "-d", "--dynamic", action='store_true', help="Enable dygraph mode")
     parser.add_argument(
         "-p",
         "--device",
......
@@ -43,13 +43,13 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch main.py --arch
 ### Single-GPU prediction
 
 Run the following command for prediction:
 ```bash
-python -u main.py --arch resnet50 -d --evaly-only /path/to/imagenet
+python -u main.py --arch resnet50 -d --eval-only /path/to/imagenet
 ```
 
 ### Multi-GPU prediction
 
 Run the following command for multi-GPU prediction:
 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch main.py --arch resnet50 --evaly-only /path/to/imagenet
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch main.py --arch resnet50 --eval-only /path/to/imagenet
 ```
@@ -71,15 +71,20 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch main.py --arch
 * **weight-decay**: weight regularization coefficient, default: 1e-4
 * **momentum**: momentum of the SGD optimizer, default: 0.9
 
+Note: when resuming with ```--resume```, if your model path is ```./output/118.pdparams```, pass the path without the suffix, i.e. ```--resume ./output/118```.
+
 ## Models
 
 | Model | top1 acc | top5 acc |
 | --- | --- | --- |
-| [ResNet50](https://paddle-hapi.bj.bcebos.com/models/resnet50.pdparams) | 76.28 | 93.04 |
-| [vgg16](https://paddle-hapi.bj.bcebos.com/models/vgg16.pdparams) | 71.84 | 90.71 |
-| [mobilenet_v1](https://paddle-hapi.bj.bcebos.com/models/mobilenet_v1_x1.0.pdparams) | 71.25 | 89.92 |
-| [mobilenet_v2](https://paddle-hapi.bj.bcebos.com/models/mobilenet_v2_x1.0.pdparams) | 72.27 | 90.66 |
+| [ResNet18](https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams) | 71.72 | 90.60 |
+| [ResNet34](https://paddle-hapi.bj.bcebos.com/models/resnet34.pdparams) | 75.02 | 92.31 |
+| [ResNet50](https://paddle-hapi.bj.bcebos.com/models/resnet50.pdparams) | 76.27 | 93.03 |
+| [ResNet101](https://paddle-hapi.bj.bcebos.com/models/resnet101.pdparams) | 78.33 | 94.04 |
+| [ResNet152](https://paddle-hapi.bj.bcebos.com/models/resnet152.pdparams) | 78.78 | 94.40 |
+| [vgg16](https://paddle-hapi.bj.bcebos.com/models/vgg16.pdparams) | 71.92 | 90.65 |
+| [mobilenet_v1](https://paddle-hapi.bj.bcebos.com/models/mobilenet_v1_x1.0.pdparams) | 71.16 | 89.89 |
+| [mobilenet_v2](https://paddle-hapi.bj.bcebos.com/models/mobilenet_v2_x1.0.pdparams) | 72.30 | 90.74 |
 
 For the settings used to reproduce the models above, see the scripts under scripts.
......
@@ -18,81 +18,35 @@ import math
 import random
 import numpy as np
 
-from datasets.folder import DatasetFolder
-
-from paddle import fluid
-
-
-def center_crop_resize(img):
-    h, w = img.shape[:2]
-    c = int(224 / 256 * min((h, w)))
-    i = (h + 1 - c) // 2
-    j = (w + 1 - c) // 2
-    img = img[i:i + c, j:j + c, :]
-    return cv2.resize(img, (224, 224), 0, 0, cv2.INTER_LINEAR)
-
-
-def random_crop_resize(img):
-    height, width = img.shape[:2]
-    area = height * width
-
-    for attempt in range(10):
-        target_area = random.uniform(0.08, 1.) * area
-        log_ratio = (math.log(3 / 4), math.log(4 / 3))
-        aspect_ratio = math.exp(random.uniform(*log_ratio))
-
-        w = int(round(math.sqrt(target_area * aspect_ratio)))
-        h = int(round(math.sqrt(target_area / aspect_ratio)))
-
-        if w <= width and h <= height:
-            i = random.randint(0, height - h)
-            j = random.randint(0, width - w)
-            img = img[i:i + h, j:j + w, :]
-            return cv2.resize(img, (224, 224), 0, 0, cv2.INTER_LINEAR)
-
-    return center_crop_resize(img)
-
-
-def random_flip(img):
-    if np.random.randint(0, 2) == 1:
-        img = img[:, ::-1, :]
-    return img
-
-
-def normalize_permute(img):
-    # transpose and convert to RGB from BGR
-    img = img.astype(np.float32).transpose((2, 0, 1))[::-1, ...]
-    mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
-    std = np.array([58.395, 57.120, 57.375], dtype=np.float32)
-    invstd = 1. / std
-    for v, m, s in zip(img, mean, invstd):
-        v.__isub__(m).__imul__(s)
-    return img
-
-
-def compose(functions):
-    def process(sample):
-        img, label = sample
-        for fn in functions:
-            img = fn(img)
-        return img, label
-
-    return process
+from hapi.datasets import DatasetFolder
+from hapi.vision.transforms import transforms
 
 
 class ImageNetDataset(DatasetFolder):
     def __init__(self, path, mode='train'):
         super(ImageNetDataset, self).__init__(path)
         self.mode = mode
+        normalize = transforms.Normalize(
+            mean=[123.675, 116.28, 103.53], std=[58.395, 57.120, 57.375])
         if self.mode == 'train':
-            self.transform = compose([
-                cv2.imread, random_crop_resize, random_flip, normalize_permute
-            ])
+            self.transform = transforms.Compose([
+                transforms.RandomResizedCrop(224),
+                transforms.RandomHorizontalFlip(),
+                transforms.Permute(mode='CHW'), normalize
+            ])
         else:
-            self.transform = compose(
-                [cv2.imread, center_crop_resize, normalize_permute])
+            self.transform = transforms.Compose([
+                transforms.Resize(256), transforms.CenterCrop(224),
+                transforms.Permute(mode='CHW'), normalize
+            ])
 
     def __getitem__(self, idx):
-        img, label = self.samples[idx]
-        return self.transform((img, [label]))
+        img_path, label = self.samples[idx]
+        img = cv2.imread(img_path).astype(np.float32)
+        label = np.array([label])
+        return self.transform(img, label)
 
     def __len__(self):
         return len(self.samples)
@@ -24,15 +24,17 @@ sys.path.append('../')
import time
import math
import numpy as np

import paddle.fluid as fluid
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.io import BatchSampler, DataLoader

from hapi.model import CrossEntropy, Input, set_device
from hapi.distributed import DistributedBatchSampler
from hapi.metrics import Accuracy
import hapi.vision.models as models

from imagenet_dataset import ImageNetDataset


def make_optimizer(step_per_epoch, parameter_list=None):
@@ -74,6 +76,9 @@ def main():
    device = set_device(FLAGS.device)
    fluid.enable_dygraph(device) if FLAGS.dynamic else None

    model_list = [x for x in models.__dict__["__all__"]]
    assert FLAGS.arch in model_list, "Expected FLAGS.arch in {}, but received {}".format(
        model_list, FLAGS.arch)
    model = models.__dict__[FLAGS.arch](pretrained=FLAGS.eval_only and
                                        not FLAGS.resume)
@@ -92,7 +97,13 @@ def main():
            len(train_dataset) * 1. / FLAGS.batch_size / ParallelEnv().nranks),
        parameter_list=model.parameters())

    model.prepare(
        optim,
        CrossEntropy(),
        Accuracy(topk=(1, 5)),
        inputs,
        labels,
        FLAGS.device)

    if FLAGS.eval_only:
        model.evaluate(
@@ -150,7 +161,7 @@ if __name__ == '__main__':
        type=str,
        help="checkpoint path to resume")
    parser.add_argument(
        "--eval-only", action='store_true', help="only evaluate the model")
    parser.add_argument(
        "--lr-scheduler",
        default='piecewise',
...
export CUDA_VISIBLE_DEVICES=0,1,2,3
# ImageNet data is expected under data/ILSVRC2012/ by default; remove -d to run in static-graph mode
python -m paddle.distributed.launch main.py \
--arch mobilenet_v1 \
--epoch 120 \
--batch-size 64 \
--learning-rate 0.1 \
--lr-scheduler piecewise \
--milestones 30 60 90 \
--weight-decay 3e-5 \
-d \
data/ILSVRC2012/
export CUDA_VISIBLE_DEVICES=0,1,2,3
# ImageNet data is expected under data/ILSVRC2012/ by default; remove -d to run in static-graph mode
python -m paddle.distributed.launch main.py \
--arch mobilenet_v2 \
--epoch 240 \
--batch-size 64 \
--learning-rate 0.1 \
--lr-scheduler cosine \
--weight-decay 4e-5 \
-d \
data/ILSVRC2012/
export CUDA_VISIBLE_DEVICES=0,1,2,3
# ImageNet data is expected under data/ILSVRC2012/ by default; remove -d to run in static-graph mode
python -m paddle.distributed.launch main.py \
--arch resnet101 \
--epoch 90 \
--batch-size 64 \
--learning-rate 0.1 \
-d \
data/ILSVRC2012/
export CUDA_VISIBLE_DEVICES=0,1,2,3
# ImageNet data is expected under data/ILSVRC2012/ by default; remove -d to run in static-graph mode
python -m paddle.distributed.launch main.py \
--arch resnet152 \
--epoch 90 \
--batch-size 64 \
--learning-rate 0.1 \
-d \
data/ILSVRC2012/
export CUDA_VISIBLE_DEVICES=0,1,2,3
# ImageNet data is expected under data/ILSVRC2012/ by default; remove -d to run in static-graph mode
python -m paddle.distributed.launch main.py \
--arch resnet18 \
--epoch 120 \
--batch-size 64 \
--learning-rate 0.1 \
--lr-scheduler cosine \
-d \
data/ILSVRC2012/
export CUDA_VISIBLE_DEVICES=0,1,2,3
# ImageNet data is expected under data/ILSVRC2012/ by default; remove -d to run in static-graph mode
python -m paddle.distributed.launch main.py \
--arch resnet34 \
--epoch 120 \
--batch-size 64 \
--learning-rate 0.1 \
--lr-scheduler cosine \
-d \
data/ILSVRC2012/
export CUDA_VISIBLE_DEVICES=0,1,2,3
# ImageNet data is expected under data/ILSVRC2012/ by default; remove -d to run in static-graph mode
python -m paddle.distributed.launch main.py \
--arch resnet50 \
--epoch 90 \
--batch-size 64 \
--learning-rate 0.1 \
-d \
data/ILSVRC2012/
export CUDA_VISIBLE_DEVICES=0,1,2,3
# ImageNet data is expected under data/ILSVRC2012/ by default; remove -d to run in static-graph mode
python -m paddle.distributed.launch main.py \
--arch vgg16 \
--epoch 90 \
--batch-size 64 \
--learning-rate 0.01 \
--lr-scheduler cosine \
-d \
data/ILSVRC2012/
Introduction
--------
This OCR example recognizes single-line text in images with an attention-based seq2seq model. Running the examples in this directory requires the latest PaddlePaddle develop version.
## Code Structure
```
.
|-- data.py         # data reading
|-- eval.py         # evaluation script
|-- images          # test images
|-- predict.py      # prediction script
|-- seq2seq_attn.py # model definition
|-- train.py        # training script
`-- utility.py      # common utilities
```
## Training / Evaluation / Prediction
- Set up the GPU environment:
```
export CUDA_VISIBLE_DEVICES=0
```
- Training
```
python train.py
```
Run with `--help` to see more options.
- Switching between dygraph and static graph
```
python train.py --dynamic=True
```
- Evaluation
```
python eval.py --init_model=checkpoint/final
```
- Prediction
Prediction does not support dygraph mode yet:
```
python predict.py --init_model=checkpoint/final --image_path=images/ --dynamic=False --beam_size=3
```
The prediction output looks as follows:
```
Image 1: images/112_chubbiness_13557.jpg
0: chubbines
1: chubbiness
2: chubbinesS
Image 2: images/177_Interfiled_40185.jpg
0: Interflied
1: Interfiled
2: InterfIled
Image 3: images/325_dame_19109.jpg
0: da
1: damo
2: dame
Image 4: images/368_fixtures_29232.jpg
0: firtures
1: Firtures
2: fixtures
```
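Each candidate above is the decoded id sequence of one beam; `utility.py` maps ids back to characters with a fixed ASCII offset. A minimal sketch of that mapping:
```
def index2word(ids):
    # ids are offset ASCII codes, e.g. 66 -> 'c', 71 -> 'h'
    return [chr(int(k + 33)) for k in ids]

print("".join(index2word([66, 71, 84, 65, 65, 72, 77, 68, 82, 82])))  # chubbiness
```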
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from os import path
import random
import traceback
import copy
import math
import tarfile
from PIL import Image
import logging
logger = logging.getLogger(__name__)
import paddle
from paddle import fluid
from paddle.fluid.dygraph.parallel import ParallelEnv
DATA_MD5 = "7256b1d5420d8c3e74815196e58cdad5"
DATA_URL = "http://paddle-ocr-data.bj.bcebos.com/data.tar.gz"
CACHE_DIR_NAME = "attention_data"
SAVED_FILE_NAME = "data.tar.gz"
DATA_DIR_NAME = "data"
TRAIN_DATA_DIR_NAME = "train_images"
TEST_DATA_DIR_NAME = "test_images"
TRAIN_LIST_FILE_NAME = "train.list"
TEST_LIST_FILE_NAME = "test.list"
class Resize(object):
def __init__(self, height=48):
self.interp = Image.NEAREST # Image.ANTIALIAS
self.height = height
def __call__(self, samples):
shape = samples[0][0].size
for i in range(len(samples)):
im = samples[i][0]
im = im.resize((shape[0], self.height), self.interp)
samples[i][0] = im
return samples
class Normalize(object):
def __init__(self,
mean=[127.5],
std=[1.0],
scale=False,
channel_first=True):
self.mean = mean
self.std = std
self.scale = scale
self.channel_first = channel_first
if not (isinstance(self.mean, list) and isinstance(self.std, list) and
isinstance(self.scale, bool)):
raise TypeError("{}: input type is invalid.".format(self))
    def __call__(self, samples):
        for i in range(len(samples)):
            im = samples[i][0]
            im = np.array(im).astype(np.float32, copy=False)
            # add the channel dimension: (H, W) -> (1, H, W)
            im = im[np.newaxis, ...]
            mean = np.array(self.mean, dtype=np.float32)[:, np.newaxis, np.newaxis]
            std = np.array(self.std, dtype=np.float32)[:, np.newaxis, np.newaxis]
            if self.scale:
                im = im / 255.0
            im = (im - mean) / std
            samples[i][0] = im
        return samples
class PadTarget(object):
def __init__(self, SOS=0, EOS=1):
self.SOS = SOS
self.EOS = EOS
def __call__(self, samples):
lens = np.array([len(s[1]) for s in samples], dtype="int64")
max_len = np.max(lens)
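        # pad every label sequence to max_len; build the decoder input
        # ([SOS] + labels), the decoder target (labels + [EOS]) and a mask
        # marking the valid (non-padded) positions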
for i in range(len(samples)):
label = samples[i][1]
if max_len > len(label):
pad_label = label + [self.EOS] * (max_len - len(label))
else:
pad_label = label
samples[i][1] = np.array([self.SOS] + pad_label, dtype='int64')
# label_out
samples[i].append(np.array(pad_label + [self.EOS], dtype='int64'))
mask = np.zeros((max_len + 1)).astype('float32')
mask[:len(label) + 1] = 1.0
# mask
samples[i].append(np.array(mask, dtype='float32'))
return samples
class BatchSampler(fluid.io.BatchSampler):
def __init__(self,
dataset,
batch_size,
shuffle=False,
drop_last=True,
seed=None):
self._dataset = dataset
self._batch_size = batch_size
self._shuffle = shuffle
self._drop_last = drop_last
self._random = np.random
self._random.seed(seed)
self._nranks = ParallelEnv().nranks
self._local_rank = ParallelEnv().local_rank
self._device_id = ParallelEnv().dev_id
self._num_samples = int(
math.ceil(len(self._dataset) * 1.0 / self._nranks))
self._total_size = self._num_samples * self._nranks
self._epoch = 0
def __iter__(self):
infos = copy.copy(self._dataset._sample_infos)
skip_num = 0
if self._shuffle:
if self._batch_size == 1:
self._random.RandomState(self._epoch).shuffle(infos)
else: # partial shuffle
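                # sort by image width so a batch holds images of similar
                # width, then rotate the list by a random offset so batches
                # still differ between epochs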
infos = sorted(infos, key=lambda x: x.w)
skip_num = random.randint(1, 100)
infos = infos[skip_num:] + infos[:skip_num]
infos += infos[:(self._total_size - len(infos))]
last_size = self._total_size % (self._batch_size * self._nranks)
batches = []
for i in range(self._local_rank * self._batch_size,
len(infos) - last_size,
self._batch_size * self._nranks):
batches.append(infos[i:i + self._batch_size])
if (not self._drop_last) and last_size != 0:
last_local_size = last_size // self._nranks
last_infos = infos[len(infos) - last_size:]
start = self._local_rank * last_local_size
batches.append(last_infos[start:start + last_local_size])
if self._shuffle:
self._random.RandomState(self._epoch).shuffle(batches)
self._epoch += 1
for batch in batches:
batch_indices = [info.idx for info in batch]
yield batch_indices
def __len__(self):
if self._drop_last:
return self._total_size // self._batch_size
else:
return math.ceil(self._total_size / float(self._batch_size))
class SampleInfo(object):
def __init__(self, idx, h, w, im_name, labels):
self.idx = idx
self.h = h
self.w = w
self.im_name = im_name
self.labels = labels
class OCRDataset(paddle.io.Dataset):
def __init__(self, image_dir, anno_file):
self.image_dir = image_dir
self.anno_file = anno_file
self._sample_infos = []
with open(anno_file, 'r') as f:
for i, line in enumerate(f):
w, h, im_name, labels = line.strip().split(' ')
h, w = int(h), int(w)
labels = [int(c) for c in labels.split(',')]
self._sample_infos.append(SampleInfo(i, h, w, im_name, labels))
def __getitem__(self, idx):
info = self._sample_infos[idx]
im_name, labels = info.im_name, info.labels
image = Image.open(path.join(self.image_dir, im_name)).convert('L')
return [image, labels]
def __len__(self):
return len(self._sample_infos)
def train(
root_dir=None,
images_dir=None,
anno_file=None,
shuffle=True, ):
    if root_dir is None:
        root_dir = download_data()
    if images_dir is None:
        images_dir = path.join(root_dir, TRAIN_DATA_DIR_NAME)
    if anno_file is None:
        anno_file = path.join(root_dir, TRAIN_LIST_FILE_NAME)
return OCRDataset(images_dir, anno_file)
def test(
root_dir=None,
images_dir=None,
anno_file=None,
shuffle=True, ):
    if root_dir is None:
        root_dir = download_data()
    if images_dir is None:
        images_dir = path.join(root_dir, TEST_DATA_DIR_NAME)
    if anno_file is None:
        anno_file = path.join(root_dir, TEST_LIST_FILE_NAME)
return OCRDataset(images_dir, anno_file)
def download_data():
'''Download train and test data.
'''
tar_file = paddle.dataset.common.download(
DATA_URL, CACHE_DIR_NAME, DATA_MD5, save_name=SAVED_FILE_NAME)
data_dir = path.join(path.dirname(tar_file), DATA_DIR_NAME)
if not path.isdir(data_dir):
t = tarfile.open(tar_file, "r:gz")
t.extractall(path=path.dirname(tar_file))
t.close()
return data_dir
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import functools
import paddle.fluid.profiler as profiler
import paddle.fluid as fluid
from hapi.model import Input, set_device
from hapi.vision.transforms import BatchCompose
from utility import add_arguments, print_arguments
from utility import SeqAccuracy, LoggerCallBack, SeqBeamAccuracy
from utility import postprocess
from seq2seq_attn import Seq2SeqAttModel, Seq2SeqAttInferModel, WeightCrossEntropy
import data
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 32, "Minibatch size.")
add_arg('test_images', str, None, "The directory of images to be used for test.")
add_arg('test_list',        str,   None,               "The list file of images to be used for test.")
add_arg('init_model',       str,   'checkpoint/final', "The init model file or directory.")
add_arg('use_gpu',          bool,  True,               "Whether to use GPU.")
add_arg('encoder_size', int, 200, "Encoder size.")
add_arg('decoder_size', int, 128, "Decoder size.")
add_arg('embedding_dim', int, 128, "Word vector dim.")
add_arg('num_classes', int, 95, "Number classes.")
add_arg('beam_size',        int,   0,                  "If nonzero, use beam search with this beam size.")
add_arg('dynamic', bool, False, "Whether to use dygraph.")
# yapf: enable
def main(FLAGS):
device = set_device("gpu" if FLAGS.use_gpu else "cpu")
fluid.enable_dygraph(device) if FLAGS.dynamic else None
model = Seq2SeqAttModel(
encoder_size=FLAGS.encoder_size,
decoder_size=FLAGS.decoder_size,
emb_dim=FLAGS.embedding_dim,
num_classes=FLAGS.num_classes)
# yapf: disable
inputs = [
Input([None, 1, 48, 384], "float32", name="pixel"),
Input([None, None], "int64", name="label_in")
]
labels = [
Input([None, None], "int64", name="label_out"),
Input([None, None], "float32", name="mask")
]
# yapf: enable
model.prepare(
loss_function=WeightCrossEntropy(),
metrics=SeqAccuracy(),
inputs=inputs,
labels=labels,
device=device)
model.load(FLAGS.init_model)
test_dataset = data.test()
test_collate_fn = BatchCompose(
[data.Resize(), data.Normalize(), data.PadTarget()])
test_sampler = data.BatchSampler(
test_dataset,
batch_size=FLAGS.batch_size,
drop_last=False,
shuffle=False)
test_loader = fluid.io.DataLoader(
test_dataset,
batch_sampler=test_sampler,
places=device,
num_workers=0,
return_list=True,
collate_fn=test_collate_fn)
model.evaluate(
eval_data=test_loader,
callbacks=[LoggerCallBack(10, 2, FLAGS.batch_size)])
def beam_search(FLAGS):
device = set_device("gpu" if FLAGS.use_gpu else "cpu")
fluid.enable_dygraph(device) if FLAGS.dynamic else None
model = Seq2SeqAttInferModel(
encoder_size=FLAGS.encoder_size,
decoder_size=FLAGS.decoder_size,
emb_dim=FLAGS.embedding_dim,
num_classes=FLAGS.num_classes,
beam_size=FLAGS.beam_size)
    inputs = [
        Input([None, 1, 48, 384], "float32", name="pixel"),
        Input([None, None], "int64", name="label_in")
    ]
    labels = [
        Input([None, None], "int64", name="label_out"),
        Input([None, None], "float32", name="mask")
    ]
model.prepare(
loss_function=None,
metrics=SeqBeamAccuracy(),
inputs=inputs,
labels=labels,
device=device)
model.load(FLAGS.init_model)
test_dataset = data.test()
test_collate_fn = BatchCompose(
[data.Resize(), data.Normalize(), data.PadTarget()])
test_sampler = data.BatchSampler(
test_dataset,
batch_size=FLAGS.batch_size,
drop_last=False,
shuffle=False)
test_loader = fluid.io.DataLoader(
test_dataset,
batch_sampler=test_sampler,
places=device,
num_workers=0,
return_list=True,
collate_fn=test_collate_fn)
model.evaluate(
eval_data=test_loader,
callbacks=[LoggerCallBack(10, 2, FLAGS.batch_size)])
if __name__ == '__main__':
FLAGS = parser.parse_args()
print_arguments(FLAGS)
if FLAGS.beam_size:
beam_search(FLAGS)
else:
main(FLAGS)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import sys
import random
import numpy as np
import argparse
import functools
from PIL import Image
import paddle.fluid.profiler as profiler
import paddle.fluid as fluid
from hapi.model import Input, set_device
from hapi.datasets.folder import ImageFolder
from hapi.vision.transforms import BatchCompose
from utility import add_arguments, print_arguments
from utility import postprocess, index2word
from seq2seq_attn import Seq2SeqAttInferModel, WeightCrossEntropy
import data
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 1, "Minibatch size.")
add_arg('image_path', str, None, "The directory of images to be used for test.")
add_arg('init_model',       str,   None,   "The init model file or directory.")
add_arg('use_gpu',          bool,  True,   "Whether to use GPU.")
# model hyperparameters
add_arg('encoder_size', int, 200, "Encoder size.")
add_arg('decoder_size', int, 128, "Decoder size.")
add_arg('embedding_dim', int, 128, "Word vector dim.")
add_arg('num_classes', int, 95, "Number classes.")
add_arg('beam_size', int, 3, "Beam size for beam search.")
add_arg('dynamic', bool, False, "Whether to use dygraph.")
# yapf: enable
def main(FLAGS):
device = set_device("gpu" if FLAGS.use_gpu else "cpu")
fluid.enable_dygraph(device) if FLAGS.dynamic else None
model = Seq2SeqAttInferModel(
encoder_size=FLAGS.encoder_size,
decoder_size=FLAGS.decoder_size,
emb_dim=FLAGS.embedding_dim,
num_classes=FLAGS.num_classes,
beam_size=FLAGS.beam_size)
inputs = [Input([None, 1, 48, 384], "float32", name="pixel"), ]
model.prepare(inputs=inputs, device=device)
model.load(FLAGS.init_model)
fn = lambda p: Image.open(p).convert('L')
test_dataset = ImageFolder(FLAGS.image_path, loader=fn)
test_collate_fn = BatchCompose([data.Resize(), data.Normalize()])
test_loader = fluid.io.DataLoader(
test_dataset,
places=device,
num_workers=0,
return_list=True,
collate_fn=test_collate_fn)
samples = test_dataset.samples
#outputs = model.predict(test_loader)
ins_id = 0
for image, in test_loader:
image = image if FLAGS.dynamic else image[0]
pred = model.test_batch([image])[0]
pred = pred[:, :, np.newaxis] if len(pred.shape) == 2 else pred
pred = np.transpose(pred, [0, 2, 1])
for ins in pred:
impath = samples[ins_id]
ins_id += 1
print('Image {}: {}'.format(ins_id, impath))
for beam_idx, beam in enumerate(ins):
id_list = postprocess(beam)
word_list = index2word(id_list)
sequence = "".join(word_list)
print('{}: {}'.format(beam_idx, sequence))
if __name__ == '__main__':
FLAGS = parser.parse_args()
print_arguments(FLAGS)
main(FLAGS)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.layers import BeamSearchDecoder
from hapi.text import RNNCell, RNN, DynamicDecode
from hapi.model import Model, Loss
class ConvBNPool(fluid.dygraph.Layer):
def __init__(self,
in_ch,
out_ch,
act="relu",
is_test=False,
pool=True,
use_cudnn=True):
super(ConvBNPool, self).__init__()
self.pool = pool
filter_size = 3
std = (2.0 / (filter_size**2 * in_ch))**0.5
param_0 = fluid.ParamAttr(
initializer=fluid.initializer.Normal(0.0, std))
std = (2.0 / (filter_size**2 * out_ch))**0.5
param_1 = fluid.ParamAttr(
initializer=fluid.initializer.Normal(0.0, std))
self.conv0 = fluid.dygraph.Conv2D(
in_ch,
out_ch,
3,
padding=1,
param_attr=param_0,
bias_attr=False,
act=None,
use_cudnn=use_cudnn)
self.bn0 = fluid.dygraph.BatchNorm(out_ch, act=act)
self.conv1 = fluid.dygraph.Conv2D(
out_ch,
out_ch,
filter_size=3,
padding=1,
param_attr=param_1,
bias_attr=False,
act=None,
use_cudnn=use_cudnn)
self.bn1 = fluid.dygraph.BatchNorm(out_ch, act=act)
if self.pool:
self.pool = fluid.dygraph.Pool2D(
pool_size=2,
pool_type='max',
pool_stride=2,
use_cudnn=use_cudnn,
ceil_mode=True)
def forward(self, inputs):
out = self.conv0(inputs)
out = self.bn0(out)
out = self.conv1(out)
out = self.bn1(out)
if self.pool:
out = self.pool(out)
return out
class CNN(fluid.dygraph.Layer):
def __init__(self, in_ch=1, is_test=False):
super(CNN, self).__init__()
self.conv_bn1 = ConvBNPool(in_ch, 16)
self.conv_bn2 = ConvBNPool(16, 32)
self.conv_bn3 = ConvBNPool(32, 64)
self.conv_bn4 = ConvBNPool(64, 128, pool=False)
def forward(self, inputs):
conv = self.conv_bn1(inputs)
conv = self.conv_bn2(conv)
conv = self.conv_bn3(conv)
conv = self.conv_bn4(conv)
return conv
class GRUCell(RNNCell):
def __init__(self,
input_size,
hidden_size,
param_attr=None,
bias_attr=None,
gate_activation='sigmoid',
candidate_activation='tanh',
origin_mode=False):
super(GRUCell, self).__init__()
self.hidden_size = hidden_size
self.fc_layer = fluid.dygraph.Linear(
input_size,
hidden_size * 3,
param_attr=param_attr,
bias_attr=False)
self.gru_unit = fluid.dygraph.GRUUnit(
hidden_size * 3,
param_attr=param_attr,
bias_attr=bias_attr,
activation=candidate_activation,
gate_activation=gate_activation,
origin_mode=origin_mode)
def forward(self, inputs, states):
# step_outputs, new_states = cell(step_inputs, states)
# for GRUCell, `step_outputs` and `new_states` both are hidden
x = self.fc_layer(inputs)
hidden, _, _ = self.gru_unit(x, states)
return hidden, hidden
@property
def state_shape(self):
return [self.hidden_size]
class Encoder(fluid.dygraph.Layer):
def __init__(
self,
in_channel=1,
rnn_hidden_size=200,
decoder_size=128,
is_test=False, ):
super(Encoder, self).__init__()
self.rnn_hidden_size = rnn_hidden_size
self.backbone = CNN(in_ch=in_channel, is_test=is_test)
para_attr = fluid.ParamAttr(
initializer=fluid.initializer.Normal(0.0, 0.02))
bias_attr = fluid.ParamAttr(
initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0)
self.gru_fwd = RNN(cell=GRUCell(
input_size=128 * 6,
hidden_size=rnn_hidden_size,
param_attr=para_attr,
bias_attr=bias_attr,
candidate_activation='relu'),
is_reverse=False,
time_major=False)
self.gru_bwd = RNN(cell=GRUCell(
input_size=128 * 6,
hidden_size=rnn_hidden_size,
param_attr=para_attr,
bias_attr=bias_attr,
candidate_activation='relu'),
is_reverse=True,
time_major=False)
self.encoded_proj_fc = fluid.dygraph.Linear(
rnn_hidden_size * 2, decoder_size, bias_attr=False)
def forward(self, inputs):
conv_features = self.backbone(inputs)
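        # treat image columns as time steps: transpose and reshape the NCHW
        # conv features into a [batch, width, channels * height] sequence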
conv_features = fluid.layers.transpose(
conv_features, perm=[0, 3, 1, 2])
n, w, c, h = conv_features.shape
seq_feature = fluid.layers.reshape(conv_features, [0, -1, c * h])
gru_fwd, _ = self.gru_fwd(seq_feature)
gru_bwd, _ = self.gru_bwd(seq_feature)
encoded_vector = fluid.layers.concat(input=[gru_fwd, gru_bwd], axis=2)
encoded_proj = self.encoded_proj_fc(encoded_vector)
return gru_bwd, encoded_vector, encoded_proj
class Attention(fluid.dygraph.Layer):
"""
Neural Machine Translation by Jointly Learning to Align and Translate.
https://arxiv.org/abs/1409.0473
"""
def __init__(self, decoder_size):
super(Attention, self).__init__()
self.fc1 = fluid.dygraph.Linear(
decoder_size, decoder_size, bias_attr=False)
self.fc2 = fluid.dygraph.Linear(decoder_size, 1, bias_attr=False)
def forward(self, encoder_vec, encoder_proj, decoder_state):
# alignment model, single-layer multilayer perceptron
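        # additive (Bahdanau) attention: e = v^T tanh(W s + U h), softmax over
        # encoder positions, context = attention-weighted sum of encoder_vec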
decoder_state = self.fc1(decoder_state)
decoder_state = fluid.layers.unsqueeze(decoder_state, [1])
e = fluid.layers.elementwise_add(encoder_proj, decoder_state)
e = fluid.layers.tanh(e)
att_scores = self.fc2(e)
att_scores = fluid.layers.squeeze(att_scores, [2])
att_scores = fluid.layers.softmax(att_scores)
context = fluid.layers.elementwise_mul(
x=encoder_vec, y=att_scores, axis=0)
context = fluid.layers.reduce_sum(context, dim=1)
return context
class DecoderCell(RNNCell):
def __init__(self, encoder_size=200, decoder_size=128):
super(DecoderCell, self).__init__()
self.attention = Attention(decoder_size)
self.gru_cell = GRUCell(
input_size=encoder_size * 2 + decoder_size,
hidden_size=decoder_size)
def forward(self, current_word, states, encoder_vec, encoder_proj):
context = self.attention(encoder_vec, encoder_proj, states)
decoder_inputs = fluid.layers.concat([current_word, context], axis=1)
hidden, _ = self.gru_cell(decoder_inputs, states)
return hidden, hidden
class Decoder(fluid.dygraph.Layer):
def __init__(self, num_classes, emb_dim, encoder_size, decoder_size):
super(Decoder, self).__init__()
self.decoder_attention = RNN(DecoderCell(encoder_size, decoder_size))
self.fc = fluid.dygraph.Linear(
decoder_size, num_classes + 2, act='softmax')
def forward(self, target, initial_states, encoder_vec, encoder_proj):
out, _ = self.decoder_attention(
target,
initial_states=initial_states,
encoder_vec=encoder_vec,
encoder_proj=encoder_proj)
pred = self.fc(out)
return pred
class Seq2SeqAttModel(Model):
def __init__(
self,
            in_channel=1,
encoder_size=200,
decoder_size=128,
emb_dim=128,
num_classes=None, ):
super(Seq2SeqAttModel, self).__init__()
        self.encoder = Encoder(in_channel, encoder_size, decoder_size)
self.fc = fluid.dygraph.Linear(
input_dim=encoder_size,
output_dim=decoder_size,
bias_attr=False,
act='relu')
self.embedding = fluid.dygraph.Embedding(
[num_classes + 2, emb_dim], dtype='float32')
self.decoder = Decoder(num_classes, emb_dim, encoder_size,
decoder_size)
def forward(self, inputs, target):
gru_backward, encoded_vector, encoded_proj = self.encoder(inputs)
decoder_boot = self.fc(gru_backward[:, 0])
trg_embedding = self.embedding(target)
prediction = self.decoder(trg_embedding, decoder_boot, encoded_vector,
encoded_proj)
return prediction
class Seq2SeqAttInferModel(Seq2SeqAttModel):
def __init__(
self,
            in_channel=1,
encoder_size=200,
decoder_size=128,
emb_dim=128,
num_classes=None,
beam_size=0,
bos_id=0,
eos_id=1,
max_out_len=20, ):
super(Seq2SeqAttInferModel, self).__init__(
            in_channel, encoder_size, decoder_size, emb_dim, num_classes)
self.beam_size = beam_size
# dynamic decoder for inference
decoder = BeamSearchDecoder(
self.decoder.decoder_attention.cell,
start_token=bos_id,
end_token=eos_id,
beam_size=beam_size,
embedding_fn=self.embedding,
output_fn=self.decoder.fc)
self.infer_decoder = DynamicDecode(
decoder, max_step_num=max_out_len, is_test=True)
def forward(self, inputs, *args):
gru_backward, encoded_vector, encoded_proj = self.encoder(inputs)
decoder_boot = self.fc(gru_backward[:, 0])
if self.beam_size:
# Tile the batch dimension with beam_size
encoded_vector = BeamSearchDecoder.tile_beam_merge_with_batch(
encoded_vector, self.beam_size)
encoded_proj = BeamSearchDecoder.tile_beam_merge_with_batch(
encoded_proj, self.beam_size)
# dynamic decoding with beam search
rs, _ = self.infer_decoder(
inits=decoder_boot,
encoder_vec=encoded_vector,
encoder_proj=encoded_proj)
return rs
class WeightCrossEntropy(Loss):
def __init__(self):
super(WeightCrossEntropy, self).__init__(average=False)
def forward(self, outputs, labels):
predict, (label, mask) = outputs[0], labels
loss = layers.cross_entropy(predict, label=label)
loss = layers.elementwise_mul(loss, mask, axis=0)
loss = layers.reduce_sum(loss)
return loss
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import sys
import random
import numpy as np
import argparse
import functools
import paddle.fluid.profiler as profiler
import paddle.fluid as fluid
from hapi.model import Input, set_device
from hapi.vision.transforms import BatchCompose
from utility import add_arguments, print_arguments
from utility import SeqAccuracy, LoggerCallBack
from seq2seq_attn import Seq2SeqAttModel, WeightCrossEntropy
import data
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 32, "Minibatch size.")
add_arg('epoch', int, 30, "Epoch number.")
add_arg('num_workers',       int,   0,            "The number of data loader workers.")
add_arg('lr', float, 0.001, "Learning rate.")
add_arg('lr_decay_strategy', str, "", "Learning rate decay strategy.")
add_arg('checkpoint_path',   str,   "checkpoint", "The directory the model is saved to.")
add_arg('train_images', str, None, "The directory of images to be used for training.")
add_arg('train_list', str, None, "The list file of images to be used for training.")
add_arg('test_images', str, None, "The directory of images to be used for test.")
add_arg('test_list',         str,   None,         "The list file of images to be used for test.")
add_arg('resume_path',       str,   None,         "The checkpoint file or directory to resume from.")
add_arg('use_gpu',           bool,  True,         "Whether to use GPU.")
# model hyperparameters
add_arg('encoder_size', int, 200, "Encoder size.")
add_arg('decoder_size', int, 128, "Decoder size.")
add_arg('embedding_dim', int, 128, "Word vector dim.")
add_arg('num_classes', int, 95, "Number classes.")
add_arg('gradient_clip', float, 5.0, "Gradient clip value.")
add_arg('dynamic', bool, False, "Whether to use dygraph.")
# yapf: enable
def main(FLAGS):
device = set_device("gpu" if FLAGS.use_gpu else "cpu")
fluid.enable_dygraph(device) if FLAGS.dynamic else None
model = Seq2SeqAttModel(
encoder_size=FLAGS.encoder_size,
decoder_size=FLAGS.decoder_size,
emb_dim=FLAGS.embedding_dim,
num_classes=FLAGS.num_classes)
lr = FLAGS.lr
if FLAGS.lr_decay_strategy == "piecewise_decay":
learning_rate = fluid.layers.piecewise_decay(
[200000, 250000], [lr, lr * 0.1, lr * 0.01])
else:
learning_rate = lr
grad_clip = fluid.clip.GradientClipByGlobalNorm(FLAGS.gradient_clip)
optimizer = fluid.optimizer.Adam(
learning_rate=learning_rate,
parameter_list=model.parameters(),
grad_clip=grad_clip)
# yapf: disable
inputs = [
Input([None,1,48,384], "float32", name="pixel"),
Input([None, None], "int64", name="label_in"),
]
labels = [
Input([None, None], "int64", name="label_out"),
Input([None, None], "float32", name="mask"),
]
# yapf: enable
model.prepare(
optimizer,
WeightCrossEntropy(),
SeqAccuracy(),
inputs=inputs,
labels=labels)
train_dataset = data.train()
train_collate_fn = BatchCompose(
[data.Resize(), data.Normalize(), data.PadTarget()])
train_sampler = data.BatchSampler(
train_dataset, batch_size=FLAGS.batch_size, shuffle=True)
train_loader = fluid.io.DataLoader(
train_dataset,
batch_sampler=train_sampler,
places=device,
num_workers=FLAGS.num_workers,
return_list=True,
collate_fn=train_collate_fn)
test_dataset = data.test()
test_collate_fn = BatchCompose(
[data.Resize(), data.Normalize(), data.PadTarget()])
test_sampler = data.BatchSampler(
test_dataset,
batch_size=FLAGS.batch_size,
drop_last=False,
shuffle=False)
test_loader = fluid.io.DataLoader(
test_dataset,
batch_sampler=test_sampler,
places=device,
num_workers=0,
return_list=True,
collate_fn=test_collate_fn)
model.fit(train_data=train_loader,
eval_data=test_loader,
epochs=FLAGS.epoch,
save_dir=FLAGS.checkpoint_path,
callbacks=[LoggerCallBack(10, 2, FLAGS.batch_size)])
if __name__ == '__main__':
FLAGS = parser.parse_args()
print_arguments(FLAGS)
main(FLAGS)
"""Contains common utility functions."""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import distutils.util
import numpy as np
import paddle.fluid as fluid
import six
from hapi.metrics import Metric
from hapi.callbacks import ProgBarLogger
def print_arguments(args):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
print("----------- Configuration Arguments -----------")
for arg, value in sorted(six.iteritems(vars(args))):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
class SeqAccuracy(Metric):
def __init__(self, name=None, *args, **kwargs):
super(SeqAccuracy, self).__init__(*args, **kwargs)
self._name = 'seq_acc'
self.reset()
def add_metric_op(self, output, label, mask, *args, **kwargs):
pred = fluid.layers.flatten(output, axis=2)
score, topk = fluid.layers.topk(pred, 1)
return topk, label, mask
def update(self, topk, label, mask, *args, **kwargs):
topk = topk.reshape(label.shape[0], -1)
seq_len = np.sum(mask, -1)
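        # a sequence counts as correct only when every predicted id matches
        # the reference up to the last valid (non-padded) position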
acc = 0
for i in range(label.shape[0]):
l = int(seq_len[i] - 1)
pred = topk[i][:l - 1]
ref = label[i][:l - 1]
if np.array_equal(pred, ref):
self.total += 1
acc += 1
self.count += 1
return float(acc) / label.shape[0]
def reset(self):
self.total = 0.
self.count = 0.
def accumulate(self):
return float(self.total) / self.count
def name(self):
return self._name
class LoggerCallBack(ProgBarLogger):
def __init__(self, log_freq=1, verbose=2, train_bs=None, eval_bs=None):
super(LoggerCallBack, self).__init__(log_freq, verbose)
self.train_bs = train_bs
self.eval_bs = eval_bs if eval_bs else train_bs
def on_train_batch_end(self, step, logs=None):
logs = logs or {}
logs['loss'] = [l / self.train_bs for l in logs['loss']]
super(LoggerCallBack, self).on_train_batch_end(step, logs)
def on_epoch_end(self, epoch, logs=None):
logs = logs or {}
logs['loss'] = [l / self.train_bs for l in logs['loss']]
super(LoggerCallBack, self).on_epoch_end(epoch, logs)
def on_eval_batch_end(self, step, logs=None):
logs = logs or {}
logs['loss'] = [l / self.eval_bs for l in logs['loss']]
super(LoggerCallBack, self).on_eval_batch_end(step, logs)
def on_eval_end(self, logs=None):
logs = logs or {}
logs['loss'] = [l / self.eval_bs for l in logs['loss']]
super(LoggerCallBack, self).on_eval_end(logs)
def index2word(ids):
return [chr(int(k + 33)) for k in ids]
def postprocess(seq, bos_idx=0, eos_idx=1):
if type(seq) is np.ndarray:
seq = seq.tolist()
eos_pos = len(seq) - 1
for i, idx in enumerate(seq):
if idx == eos_idx:
eos_pos = i
break
seq = [
idx for idx in seq[:eos_pos + 1] if idx != bos_idx and idx != eos_idx
]
return seq
class SeqBeamAccuracy(Metric):
def __init__(self, name=None, *args, **kwargs):
super(SeqBeamAccuracy, self).__init__(*args, **kwargs)
self._name = 'seq_acc'
self.reset()
def add_metric_op(self, output, label, mask, *args, **kwargs):
return output, label, mask
def update(self, preds, labels, masks, *args, **kwargs):
preds = preds[:, :, np.newaxis] if len(preds.shape) == 2 else preds
preds = np.transpose(preds, [0, 2, 1])
seq_len = np.sum(masks, -1)
acc = 0
for i in range(labels.shape[0]):
l = int(seq_len[i] - 1)
#ref = labels[i][: l - 1]
ref = np.array(postprocess(labels[i]))
pred = preds[i]
for idx, beam in enumerate(pred):
beam_pred = np.array(postprocess(beam))
if np.array_equal(beam_pred, ref):
self.total += 1
acc += 1
break
self.count += 1
return float(acc) / labels.shape[0]
def reset(self):
self.total = 0.
self.count = 0.
def accumulate(self):
return float(self.total) / self.count
def name(self):
return self._name
# Sequence Tagging
## 1. Introduction
Sequence Tagging is a sequence labeling model usable for tasks such as word segmentation, part-of-speech (POS) tagging, and named entity recognition. We evaluated the joint label model (word segmentation, POS tagging, and named entity recognition together) on our in-house dataset; the figures are listed in the table below:

|Model|Precision|Recall|F1-score|
|:-:|:-:|:-:|:-:|
|Lexical Analysis|88.26%|89.20%|88.73%|
## 2. Quick Start
### Installation
#### 1. Install PaddlePaddle
This project requires PaddlePaddle 1.7+ and PaddleHub 1.0.0+. For PaddlePaddle installation, see the official [quick install guide](http://www.paddlepaddle.org/paddle#quick-start); for PaddleHub, see [PaddleHub](https://github.com/PaddlePaddle/PaddleHub).
> Warning: the GPU and CPU builds of PaddlePaddle are separate packages, paddlepaddle-gpu and paddlepaddle; take care to install the right one.
#### 2. Clone the repository
Clone the toolkit repository to your local machine:
```bash
git clone https://github.com/PaddlePaddle/hapi.git
cd hapi/sequence_tagging
```
#### 3. Environment requirements
PaddlePaddle requires Python 2.7.15+ (for Python 2) or Python 3.5.1+/3.6/3.7 (for Python 3). The sequence tagging code runs under both Python 2 and 3 with no further version restrictions.
### Data Preparation
#### 1. Quick download
The **datasets** and **pretrained models** used by this project can be fetched quickly with the script below; to download only part of the data or models, follow steps 2 and 3 as needed.
```bash
python downloads.py all
```
Or, in an environment that can run shell scripts:
```bash
sh downloads.sh
```
#### 2. Training dataset
Download the dataset archive; extracting it creates the `./data/` directory:
```bash
python downloads.py dataset
```
#### 3. Pretrained model
We release a lexical analysis model trained on our in-house dataset for direct use; download it with:
```bash
# download baseline model
python downloads.py model
```
### Model Training
With the sample dataset, the commands below train on the training set `./data/train.tsv`.
Single-GPU training:
```
# setting visible devices for training
export CUDA_VISIBLE_DEVICES=0
python -u train.py \
    --device gpu \
    --dynamic False
# --device: run on the gpu or the cpu device
# --dynamic: whether to train in dygraph mode; set True for dygraph, False for static graph
```
Multi-GPU training:
```
# setting visible devices for training
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.launch --selected_gpus=0,1,2,3 train.py \
    --device gpu \
    --dynamic False
# --device: run on the gpu or the cpu device
# --dynamic: whether to train in dygraph mode; set True for dygraph, False for static graph
```
CPU training:
```
python -u train.py \
    --device cpu \
    --dynamic False
# --device: run on the gpu or the cpu device
# --dynamic: whether to train in dygraph mode; set True for dygraph, False for static graph
```
### Model Prediction
Load a trained model and run prediction on unseen data:
```bash
python predict.py \
    --init_from_checkpoint model_baseline/params \
    --output_file predict.result \
    --mode predict \
    --device cpu \
    --dynamic False
# --init_from_checkpoint: checkpoint to initialize the model from
# --output_file: file to write prediction results to
# --device: run on the gpu or the cpu device
# --mode: run mode; "train" trains the model, "predict" runs prediction
# --dynamic: whether to run in dygraph mode; set True for dygraph, False for static graph
```
### Model Evaluation
We trained a lexical analysis model on our in-house dataset; it can be evaluated directly on the test set `./data/test.tsv`:
```bash
# baseline model
python eval.py \
    --init_from_checkpoint ./model_baseline/params \
    --mode predict \
    --device cpu \
    --dynamic False
# --init_from_checkpoint: checkpoint to initialize the model from
# --device: run on the gpu or the cpu device
# --mode: run mode; "train" trains the model, "predict" runs prediction
# --dynamic: whether to run in dygraph mode; set True for dygraph, False for static graph
```
## 3. Advanced Usage
### Task Definition and Modeling
The input of sequence tagging is a string (a "sentence" below); the output is the word boundaries and categories in that sentence. Sequence tagging is the classic way to model lexical analysis. We use a GRU-based network to learn features and feed them into a CRF decoding layer that produces the tag sequence. The CRF layer essentially replaces the linear model of a traditional CRF with a nonlinear neural network and optimizes a sentence-level likelihood, which better handles the label bias problem. The model works as follows (a decoding sketch follows this list):
1. The input is a one-hot representation in which each character is an id;
2. the id sequence is mapped through the vocabulary into a sequence of dense character embeddings;
3. the embedding sequence feeds a bidirectional GRU that learns a feature representation of the input; we stack two bidirectional GRU layers to increase capacity;
4. the CRF takes the GRU features as input and the tag sequence as supervision to perform sequence tagging.
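To make step 4 concrete, here is a minimal numpy sketch of the Viterbi decoding that a CRF layer performs at inference time; the emission and transition scores below are random placeholders, not values from the trained model:
```python
import numpy as np


def viterbi_decode(emissions, transitions):
    # emissions: [seq_len, num_tags] scores from the BiGRU features
    # transitions: [num_tags, num_tags] CRF transition scores
    seq_len, num_tags = emissions.shape
    score = emissions[0].copy()
    back = np.zeros((seq_len, num_tags), dtype=np.int64)
    for t in range(1, seq_len):
        # cand[i, j]: score of reaching tag j at step t coming from tag i
        cand = score[:, None] + transitions + emissions[t]
        back[t] = cand.argmax(axis=0)
        score = cand.max(axis=0)
    # follow the back-pointers from the best final tag
    best = [int(score.argmax())]
    for t in range(seq_len - 1, 0, -1):
        best.append(int(back[t][best[-1]]))
    return best[::-1]


# 57 tags, matching the size of tag.dic in this repository
tags = viterbi_decode(np.random.rand(6, 57), np.random.rand(57, 57))
```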
The downloadable in-house data is jointly annotated for word segmentation, POS tagging, and named entity recognition. The POS and entity tag sets are listed in the table below: 24 POS tags (lowercase) and 4 entity categories (uppercase). Note that person, location, organization, and time each have two tag sets (PER / LOC / ORG / TIME and nr / ns / nt / t); words assigned the second set are those the model judged to be low-confidence persons, locations, organizations, or times. Based on these two tag sets, developers can make their own precision/recall trade-off for the four categories.
| Tag | Meaning | Tag | Meaning | Tag | Meaning | Tag | Meaning |
| ---- | -------- | ---- | -------- | ---- | -------- | ---- | -------- |
| n | common noun | f | noun of direction | s | noun of place | t | time |
| nr | person name | ns | place name | nt | organization name | nw | work title |
| nz | other proper noun | v | common verb | vd | adverbial verb | vn | nominal verb |
| a | adjective | ad | adverbial adjective | an | nominal adjective | d | adverb |
| m | numeral | q | measure word | r | pronoun | p | preposition |
| c | conjunction | u | particle | xc | other function word | w | punctuation |
| PER | person | LOC | location | ORG | organization | TIME | time |
### Model Architecture
The model described above is illustrated in the figure below:<br />
<p align="center">
<img src="./images/gru-crf-model.png" width = "340" height = "300" /> <br />
Overall Architecture of GRU-CRF-MODEL
</p>
### Data Format
Training data can be organized by users to fit their own scenario. Apart from the fixed first line `text_a\tlabel`, every line consists of two tab-separated columns: the first is UTF-8 encoded Chinese text whose characters are separated by `\002`, and the second is the tag of each character, also separated by `\002`. We use the IOB2 tagging scheme: X-B marks the first character of a word of type X, X-I marks a continuation, and O marks characters of no interest (in joint POS and entity tagging, O does not actually occur). An example follows (a parsing sketch comes after it):
```text
除\002了\002他\002续\002任\002十\002二\002届\002政\002协\002委\002员\002,\002马\002化\002腾\002,\002雷\002军\002,\002李\002彦\002宏\002也\002被\002推\002选\002为\002新\002一\002届\002全\002国\002人\002大\002代\002表\002或\002全\002国\002政\002协\002委\002员 p-B\002p-I\002r-B\002v-B\002v-I\002m-B\002m-I\002m-I\002ORG-B\002ORG-I\002n-B\002n-I\002w-B\002PER-B\002PER-I\002PER-I\002w-B\002PER-B\002PER-I\002w-B\002PER-B\002PER-I\002PER-I\002d-B\002p-B\002v-B\002v-I\002v-B\002a-B\002m-B\002m-I\002ORG-B\002ORG-I\002ORG-I\002ORG-I\002n-B\002n-I\002c-B\002n-B\002n-I\002ORG-B\002ORG-I\002n-B\002n-I
```
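As a clarifying sketch, one such line can be parsed into character and tag sequences like this (mirroring `reader.py`; the file path is an assumption):
```python
import io

with io.open("data/train.tsv", "r", encoding="utf-8") as f:
    next(f)  # skip the fixed "text_a\tlabel" header line
    for line in f:
        text, label = line.rstrip("\n").split("\t")
        chars = text.split(u"\002")  # characters of the sentence
        tags = label.split(u"\002")  # one IOB2 tag per character
        assert len(chars) == len(tags)
```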
+ We release the full model and its dependency data along with the code. However, the training data is too large to release, so the `data` directory only contains a few samples illustrating the input format.
+ The model's dependency data includes:
  1. the vocabulary of the input text, `word.dic` under the `conf` directory;
  2. a dictionary for normalizing special characters in the input text, `q2b.dic` under the `conf` directory;
  3. the tag dictionary, `tag.dic` under the `conf` directory.
+ Both training and prediction preprocess the raw data, specifically:
  1. extracting sentences and tags from the raw files to build sentence and tag sequences;
  2. normalizing special characters in the sentences;
  3. looking up the integer index of each character in the vocabulary.
### Code Structure
```text
├── README.md    # this document
├── data/        # datasets
├── conf/        # dictionaries and default configuration
├── images/      # images used by the documentation
├── utils/       # common utility functions
├── train.py     # training script
├── predict.py   # prediction script
├── eval.py      # lexical analysis evaluation script
├── downloads.py # script for downloading data and models
├── downloads.sh # script for downloading data and models
└── reader.py    # data reading helpers
```
## 4. Others
### Citing sequence tagging in papers
If you use sequence tagging in your academic work, please add the citation below. We are delighted that the model can contribute to your research.
```text
@article{jiao2018LAC,
    title={Chinese Lexical Analysis with Deep Bi-GRU-CRF Network},
    author={Jiao, Zhenyu and Sun, Shuqi and Sun, Ke},
    journal={arXiv preprint arXiv:1807.01882},
    year={2018},
    url={https://arxiv.org/abs/1807.01882}
}
```
### Contributing
If you can fix an issue or add a new feature, feel free to open a PR. If the PR is accepted, we score the contribution by quality and difficulty (0-5, higher is better); once you accumulate 10 points, you may contact us for an interview opportunity or a recommendation letter.
 
、 ,
。 .
— -
~ ~
‖ |
… .
‘ '
’ '
“ "
” "
〔 (
〕 )
〈 <
〉 >
「 '
」 '
『 "
』 "
〖 [
〗 ]
【 [
】 ]
∶ :
$ $
! !
" "
# #
% %
& &
' '
( (
) )
* *
+ +
, ,
- -
. .
/ /
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
: :
; ;
< <
= =
> >
? ?
@ @
A a
B b
C c
D d
E e
F f
G g
H h
I i
J j
K k
L l
M m
N n
O o
P p
Q q
R r
S s
T t
U u
V v
W w
X x
Y y
Z z
[ [
\ \
] ]
^ ^
_ _
` `
a a
b b
c c
d d
e e
f f
g g
h h
i i
j j
k k
l l
m m
n n
o o
p p
q q
r r
s s
t t
u u
v v
w w
x x
y y
z z
{ {
| |
} }
 ̄ ~
〝 "
〞 "
﹐ ,
﹑ ,
﹒ .
﹔ ;
﹕ :
﹖ ?
﹗ !
﹙ (
﹚ )
﹛ {
﹜ {
﹝ [
﹞ ]
﹟ #
﹠ &
﹡ *
﹢ +
﹣ -
﹤ <
﹥ >
﹦ =
﹨ \
﹩ $
﹪ %
﹫ @
,
A a
B b
C c
D d
E e
F f
G g
H h
I i
J j
K k
L l
M m
N n
O o
P p
Q q
R r
S s
T t
U u
V v
W w
X x
Y y
Z z
0 a-B
1 a-I
2 ad-B
3 ad-I
4 an-B
5 an-I
6 c-B
7 c-I
8 d-B
9 d-I
10 f-B
11 f-I
12 m-B
13 m-I
14 n-B
15 n-I
16 nr-B
17 nr-I
18 ns-B
19 ns-I
20 nt-B
21 nt-I
22 nw-B
23 nw-I
24 nz-B
25 nz-I
26 p-B
27 p-I
28 q-B
29 q-I
30 r-B
31 r-I
32 s-B
33 s-I
34 t-B
35 t-I
36 u-B
37 u-I
38 v-B
39 v-I
40 vd-B
41 vd-I
42 vn-B
43 vn-I
44 w-B
45 w-I
46 xc-B
47 xc-I
48 PER-B
49 PER-I
50 LOC-B
51 LOC-I
52 ORG-B
53 ORG-I
54 TIME-B
55 TIME-I
56 O
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Download script, download dataset and pretrain models.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import io
import os
import sys
import time
import hashlib
import tarfile
import requests
FILE_INFO = {
'BASE_URL': 'https://baidu-nlp.bj.bcebos.com/',
'DATA': {
'name': 'lexical_analysis-dataset-2.0.0.tar.gz',
'md5': '71e4a9a36d0f0177929a1bccedca7dba'
},
'MODEL': {
'name': 'sequence_tagging_dy.tar.gz',
'md5': "1125d374c03c8218b6e47325dcf607e3"
},
}
def usage():
desc = ("\nDownload datasets and pretrained models for sequence tagging.\n"
"Usage:\n"
" 1. python download.py all\n"
" 2. python download.py dataset\n"
" 3. python download.py model\n")
print(desc)
def md5file(fname):
hash_md5 = hashlib.md5()
with io.open(fname, "rb") as fin:
for chunk in iter(lambda: fin.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
def extract(fname, dir_path):
"""
Extract tar.gz file
"""
try:
tar = tarfile.open(fname, "r:gz")
file_names = tar.getnames()
for file_name in file_names:
tar.extract(file_name, dir_path)
print(file_name)
tar.close()
except Exception as e:
raise e
def _download(url, filename, md5sum):
"""
Download file and check md5
"""
retry = 0
retry_limit = 3
chunk_size = 4096
while not (os.path.exists(filename) and md5file(filename) == md5sum):
if retry < retry_limit:
retry += 1
else:
raise RuntimeError(
"Cannot download dataset ({0}) with retry {1} times.".format(
url, retry_limit))
try:
start = time.time()
size = 0
res = requests.get(url, stream=True)
filesize = int(res.headers['content-length'])
if res.status_code == 200:
print("[Filesize]: %0.2f MB" % (filesize / 1024 / 1024))
# save by chunk
with io.open(filename, "wb") as fout:
for chunk in res.iter_content(chunk_size=chunk_size):
if chunk:
fout.write(chunk)
size += len(chunk)
pr = '>' * int(size * 50 / filesize)
print(
'\r[Process ]: %s%.2f%%' %
(pr, float(size / filesize * 100)),
end='')
end = time.time()
print("\n[CostTime]: %.2f s" % (end - start))
except Exception as e:
print(e)
def download(name, dir_path):
url = FILE_INFO['BASE_URL'] + FILE_INFO[name]['name']
file_path = os.path.join(dir_path, FILE_INFO[name]['name'])
if not os.path.exists(dir_path):
os.makedirs(dir_path)
# download data
print("Downloading : %s" % name)
_download(url, file_path, FILE_INFO[name]['md5'])
# extract data
print("Extracting : %s" % file_path)
extract(file_path, dir_path)
os.remove(file_path)
if __name__ == '__main__':
if len(sys.argv) != 2:
usage()
sys.exit(1)
pwd = os.path.join(os.path.dirname(__file__), './')
ernie_dir = os.path.join(os.path.dirname(__file__), './pretrained')
    if sys.argv[1] == 'all':
        download('DATA', pwd)
        download('MODEL', pwd)
    elif sys.argv[1] == "dataset":
        download('DATA', pwd)
    elif sys.argv[1] == "model":
        download('MODEL', pwd)
    else:
        usage()
#!/bin/bash
# download baseline model file to ./model_baseline/
if [ -d ./model_baseline/ ]
then
echo "./model_baseline/ directory already existed, ignore download"
else
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/sequence_tagging_dy.tar.gz
tar xvf sequence_tagging_dy.tar.gz
/bin/rm sequence_tagging_dy.tar.gz
fi
# download dataset file to ./data/
if [ -d ./data/ ]
then
echo "./data/ directory already existed, ignore download"
else
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/lexical_analysis-dataset-2.0.0.tar.gz
tar xvf lexical_analysis-dataset-2.0.0.tar.gz
/bin/rm lexical_analysis-dataset-2.0.0.tar.gz
fi
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
SequenceTagging network structure
"""
from __future__ import division
from __future__ import print_function
import io
import os
import sys
import math
import argparse
import numpy as np
from train import SeqTagging
from utils.configure import PDConfig
from utils.check import check_gpu, check_version
from utils.metrics import chunk_count
from reader import LacDataset, create_lexnet_data_generator, create_dataloader
work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(os.path.join(work_dir, "../"))
from hapi.model import set_device, Input
import paddle.fluid as fluid
from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.layers.utils import flatten
def main(args):
place = set_device(args.device)
fluid.enable_dygraph(place) if args.dynamic else None
inputs = [Input([None, None], 'int64', name='words'),
Input([None], 'int64', name='length')]
feed_list = None if args.dynamic else [x.forward() for x in inputs]
dataset = LacDataset(args)
eval_path = args.test_file
chunk_evaluator = fluid.metrics.ChunkEvaluator()
chunk_evaluator.reset()
eval_generator = create_lexnet_data_generator(
args, reader=dataset, file_name=eval_path, place=place, mode="test")
eval_dataset = create_dataloader(
eval_generator, place, feed_list=feed_list)
vocab_size = dataset.vocab_size
num_labels = dataset.num_labels
model = SeqTagging(args, vocab_size, num_labels)
optim = AdamOptimizer(
learning_rate=args.base_learning_rate,
parameter_list=model.parameters())
model.mode = "test"
model.prepare(inputs=inputs)
model.load(args.init_from_checkpoint, skip_mismatch=True)
for data in eval_dataset():
if len(data) == 1:
batch_data = data[0]
targets = np.array(batch_data[2])
else:
batch_data = data
targets = batch_data[2].numpy()
inputs_data = [batch_data[0], batch_data[1]]
crf_decode, length = model.test(inputs=inputs_data)
num_infer_chunks, num_label_chunks, num_correct_chunks = chunk_count(crf_decode, targets, length, dataset.id2label_dict)
chunk_evaluator.update(num_infer_chunks, num_label_chunks, num_correct_chunks)
precision, recall, f1 = chunk_evaluator.eval()
print("[test] P: %.5f, R: %.5f, F1: %.5f" % (precision, recall, f1))
if __name__ == '__main__':
args = PDConfig(yaml_file="sequence_tagging.yaml")
args.build()
args.Print()
use_gpu = True if args.device == "gpu" else False
check_gpu(use_gpu)
check_version()
main(args)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
SequenceTagging network structure
"""
from __future__ import division
from __future__ import print_function
import io
import os
import sys
import math
import argparse
import numpy as np
from train import SeqTagging
from utils.check import check_gpu, check_version
from utils.configure import PDConfig
from reader import LacDataset, create_lexnet_data_generator, create_dataloader
work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(os.path.join(work_dir, "../"))
from hapi.model import set_device, Input
import paddle.fluid as fluid
from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.layers.utils import flatten
def main(args):
place = set_device(args.device)
fluid.enable_dygraph(place) if args.dynamic else None
inputs = [Input([None, None], 'int64', name='words'),
Input([None], 'int64', name='length')]
feed_list = None if args.dynamic else [x.forward() for x in inputs]
dataset = LacDataset(args)
predict_path = args.predict_file
predict_generator = create_lexnet_data_generator(
args, reader=dataset, file_name=predict_path, place=place, mode="predict")
predict_dataset = create_dataloader(
predict_generator, place, feed_list=feed_list)
vocab_size = dataset.vocab_size
num_labels = dataset.num_labels
model = SeqTagging(args, vocab_size, num_labels)
optim = AdamOptimizer(
learning_rate=args.base_learning_rate,
parameter_list=model.parameters())
model.mode = "test"
model.prepare(inputs=inputs)
model.load(args.init_from_checkpoint, skip_mismatch=True)
f = open(args.output_file, "wb")
for data in predict_dataset():
if len(data) == 1:
input_data = data[0]
else:
input_data = data
results, length = model.test(inputs=flatten(input_data))
for i in range(len(results)):
word_len = length[i]
word_ids = results[i][: word_len]
tags = [dataset.id2label_dict[str(id)] for id in word_ids]
f.write("\002".join(tags) + "\n")
if __name__ == '__main__':
args = PDConfig(yaml_file="sequence_tagging.yaml")
args.build()
args.Print()
use_gpu = True if args.device == "gpu" else False
check_gpu(use_gpu)
check_version()
main(args)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
SequenceTagging dataset
"""
from __future__ import division
from __future__ import print_function
import io
import numpy as np
import paddle
class LacDataset(object):
"""
Load lexical analysis dataset
"""
def __init__(self, args):
self.word_dict_path = args.word_dict_path
self.label_dict_path = args.label_dict_path
self.word_rep_dict_path = args.word_rep_dict_path
self._load_dict()
def _load_dict(self):
self.word2id_dict = self.load_kv_dict(
self.word_dict_path, reverse=True, value_func=np.int64)
self.id2word_dict = self.load_kv_dict(self.word_dict_path)
self.label2id_dict = self.load_kv_dict(
self.label_dict_path, reverse=True, value_func=np.int64)
self.id2label_dict = self.load_kv_dict(self.label_dict_path)
if self.word_rep_dict_path is None:
self.word_replace_dict = dict()
else:
self.word_replace_dict = self.load_kv_dict(self.word_rep_dict_path)
def load_kv_dict(self,
dict_path,
reverse=False,
delimiter="\t",
key_func=None,
value_func=None):
"""
Load key-value dict from file
"""
result_dict = {}
for line in io.open(dict_path, "r", encoding='utf8'):
terms = line.strip("\n").split(delimiter)
if len(terms) != 2:
continue
if reverse:
value, key = terms
else:
key, value = terms
if key in result_dict:
raise KeyError("key duplicated with [%s]" % (key))
if key_func:
key = key_func(key)
if value_func:
value = value_func(value)
result_dict[key] = value
return result_dict
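    # Dict file layout (inferred from the loaders in _load_dict): word.dic
    # and tag.dic keep one "id<TAB>token" pair per line, e.g. "0\tPAD";
    # load_kv_dict(..., reverse=True, value_func=np.int64) then builds the
    # token->id mapping, while the default call builds id->token.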
@property
def vocab_size(self):
return max(self.word2id_dict.values()) + 1
@property
def num_labels(self):
return max(self.label2id_dict.values()) + 1
def get_num_examples(self, filename):
"""num of line of file"""
return sum(1 for line in io.open(filename, "r", encoding='utf8'))
def word_to_ids(self, words):
"""convert word to word index"""
word_ids = []
for word in words:
word = self.word_replace_dict.get(word, word)
if word not in self.word2id_dict:
word = "OOV"
word_id = self.word2id_dict[word]
word_ids.append(word_id)
return word_ids
def label_to_ids(self, labels):
"""convert label to label index"""
label_ids = []
for label in labels:
if label not in self.label2id_dict:
label = "O"
label_id = self.label2id_dict[label]
label_ids.append(label_id)
return label_ids
def file_reader(self,
filename,
mode="train",
batch_size=32,
max_seq_len=126):
"""
yield (word_idx, target_idx) one by one from file,
or yield (word_idx, ) in `infer` mode
"""
def wrapper():
fread = io.open(filename, "r", encoding="utf-8")
if mode == "train":
headline = next(fread)
headline = headline.strip().split('\t')
assert len(headline) == 2 and headline[0] == "text_a" and headline[
1] == "label"
for line in fread:
words, labels = line.strip("\n").split("\t")
if len(words) < 1:
continue
word_ids = self.word_to_ids(words.split("\002"))
label_ids = self.label_to_ids(labels.split("\002"))
assert len(word_ids) == len(label_ids)
                    word_ids = word_ids[0:max_seq_len]
                    words_len = np.int64(len(word_ids))
                    word_ids += [0 for _ in range(max_seq_len - words_len)]
label_ids = label_ids[0:max_seq_len]
label_ids += [0 for _ in range(max_seq_len - words_len)]
assert len(word_ids) == len(label_ids)
yield word_ids, label_ids, words_len
elif mode == "test":
headline = next(fread)
headline = headline.strip().split('\t')
assert len(headline) == 2 and headline[0] == "text_a" and headline[
1] == "label"
for line in fread:
words, labels = line.strip("\n").split("\t")
if len(words) < 1:
continue
word_ids = self.word_to_ids(words.split("\002"))
label_ids = self.label_to_ids(labels.split("\002"))
assert len(word_ids) == len(label_ids)
words_len = np.int64(len(word_ids))
yield word_ids, label_ids, words_len
else:
for line in fread:
words = line.strip("\n").split('\t')[0]
if words == u"text_a":
continue
if "\002" not in words:
word_ids = self.word_to_ids(words)
else:
word_ids = self.word_to_ids(words.split("\002"))
words_len = np.int64(len(word_ids))
yield word_ids, words_len
fread.close()
return wrapper
def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
def padding_data(max_len, batch_data):
padding_batch_data = []
for data in batch_data:
data += [0 for _ in range(max_len - len(data))]
padding_batch_data.append(data)
return padding_batch_data
def wrapper():
if mode == "train":
batch_words, batch_labels, seq_lens = [], [], []
            for epoch in range(args.epoch):
for instance in reader.file_reader(
file_name, mode, max_seq_len=args.max_seq_len)():
words, labels, words_len = instance
if len(seq_lens) < args.batch_size:
batch_words.append(words)
batch_labels.append(labels)
seq_lens.append(words_len)
if len(seq_lens) == args.batch_size:
yield batch_words, seq_lens, batch_labels, batch_labels
batch_words, batch_labels, seq_lens = [], [], []
if len(seq_lens) > 0:
yield batch_words, seq_lens, batch_labels, batch_labels
elif mode == "test":
batch_words, batch_labels, seq_lens, max_len = [], [], [], 0
for instance in reader.file_reader(
file_name, mode, max_seq_len=args.max_seq_len)():
words, labels, words_len = instance
max_len = words_len if words_len > max_len else max_len
if len(seq_lens) < args.batch_size:
batch_words.append(words)
seq_lens.append(words_len)
batch_labels.append(labels)
if len(seq_lens) == args.batch_size:
padding_batch_words = padding_data(max_len, batch_words)
padding_batch_labels = padding_data(max_len, batch_labels)
yield padding_batch_words, seq_lens, padding_batch_labels, padding_batch_labels
batch_words, batch_labels, seq_lens, max_len = [], [], [], 0
if len(seq_lens) > 0:
padding_batch_words = padding_data(max_len, batch_words)
padding_batch_labels = padding_data(max_len, batch_labels)
yield padding_batch_words, seq_lens, padding_batch_labels, padding_batch_labels
else:
batch_words, seq_lens, max_len = [], [], 0
for instance in reader.file_reader(
file_name, mode, max_seq_len=args.max_seq_len)():
words, words_len = instance
if len(seq_lens) < args.batch_size:
batch_words.append(words)
seq_lens.append(words_len)
max_len = words_len if words_len > max_len else max_len
if len(seq_lens) == args.batch_size:
padding_batch_words = padding_data(max_len, batch_words)
yield padding_batch_words, seq_lens
batch_words, seq_lens, max_len = [], [], 0
if len(seq_lens) > 0:
padding_batch_words = padding_data(max_len, batch_words)
yield padding_batch_words, seq_lens
return wrapper
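# A standalone sketch of the batch padding performed by padding_data() in
# the generator above (illustrative only, not used by the pipeline itself):
def _padding_demo():
    batch = [[1, 2, 3], [4, 5]]
    max_len = max(len(seq) for seq in batch)
    # pad every sequence with 0 up to the batch-wise max length
    return [seq + [0] * (max_len - len(seq)) for seq in batch]
# _padding_demo() -> [[1, 2, 3], [4, 5, 0]]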
def create_dataloader(generator, place, feed_list=None):
if not feed_list:
data_loader = paddle.io.DataLoader.from_generator(
capacity=50,
use_double_buffer=True,
iterable=True,
return_list=True)
else:
data_loader = paddle.io.DataLoader.from_generator(
feed_list=feed_list,
capacity=50,
use_double_buffer=True,
iterable=True,
return_list=True)
data_loader.set_batch_generator(generator, places=place)
return data_loader
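# Usage sketch (illustrative, mirroring the main() functions in this
# example): in dynamic-graph mode feed_list stays None and the loader
# yields plain lists; in static-graph mode pass the Variables built from
# the hapi Input specs:
#   feed_list = None if args.dynamic else [x.forward() for x in inputs]
#   loader = create_dataloader(generator, place, feed_list=feed_list)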
word_dict_path: "./conf/word.dic"
label_dict_path: "./conf/tag.dic"
word_rep_dict_path: "./conf/q2b.dic"
device: "cpu"
dynamic: True
epoch: 10
base_learning_rate: 0.001
word_emb_dim: 128
grnn_hidden_dim: 128
bigru_num: 2
emb_learning_rate: 1.0
crf_learning_rate: 1.0
batch_size: 300
max_seq_len: 126
num_devices: 1
save_dir: "model"
init_from_checkpoint: "model_baseline/params"
init_from_pretrain_model: ""
save_freq: 1
eval_freq: 1
output_file: "predict.result"
test_file: "./data/test.tsv"
train_file: "./data/train.tsv"
predict_file: "./data/infer.tsv"
mode: "train"
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
SequenceTagging network structure
"""
from __future__ import division
from __future__ import print_function
import io
import os
import sys
import math
import argparse
import numpy as np
work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(os.path.join(work_dir, "../"))
from hapi.metrics import Metric
from hapi.model import Model, Input, Loss, set_device
from hapi.text.text import SequenceTagging
from utils.check import check_gpu, check_version
from utils.configure import PDConfig
from reader import LacDataset, create_lexnet_data_generator, create_dataloader
import paddle.fluid as fluid
from paddle.fluid.optimizer import AdamOptimizer
class SeqTagging(Model):
def __init__(self, args, vocab_size, num_labels, length=None):
super(SeqTagging, self).__init__()
"""
define the lexical analysis network structure
word: stores the input of the model
for_infer: a boolean value, indicating if the model to be created is for training or predicting.
return:
for infer: return the prediction
otherwise: return the prediction
"""
self.mode_type = args.mode
self.word_emb_dim = args.word_emb_dim
self.vocab_size = vocab_size
self.num_labels = num_labels
self.grnn_hidden_dim = args.grnn_hidden_dim
        self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(
            args) else 1.0
        self.crf_lr = args.crf_learning_rate if 'crf_learning_rate' in dir(
            args) else 1.0
self.bigru_num = args.bigru_num
self.batch_size = args.batch_size
self.init_bound = 0.1
        self.length = length
self.sequence_tagging = SequenceTagging(
vocab_size=self.vocab_size,
num_labels=self.num_labels,
batch_size=self.batch_size,
word_emb_dim=self.word_emb_dim,
grnn_hidden_dim=self.grnn_hidden_dim,
emb_learning_rate=self.emb_lr,
crf_learning_rate=self.crf_lr,
bigru_num=self.bigru_num,
init_bound=self.init_bound,
length=self.length)
def forward(self, *inputs):
"""
Configure the network
"""
word = inputs[0]
lengths = inputs[1]
if self.mode_type == "train" or self.mode_type == "test":
target = inputs[2]
outputs = self.sequence_tagging(word, lengths, target)
else:
outputs = self.sequence_tagging(word, lengths)
return outputs
class Chunk_eval(fluid.dygraph.Layer):
def __init__(self,
num_chunk_types,
chunk_scheme,
excluded_chunk_types=None):
super(Chunk_eval, self).__init__()
self.num_chunk_types = num_chunk_types
self.chunk_scheme = chunk_scheme
self.excluded_chunk_types = excluded_chunk_types
def forward(self, input, label, seq_length=None):
precision = self._helper.create_variable_for_type_inference(
dtype="float32")
recall = self._helper.create_variable_for_type_inference(
dtype="float32")
f1_score = self._helper.create_variable_for_type_inference(
dtype="float32")
num_infer_chunks = self._helper.create_variable_for_type_inference(
dtype="int64")
num_label_chunks = self._helper.create_variable_for_type_inference(
dtype="int64")
num_correct_chunks = self._helper.create_variable_for_type_inference(
dtype="int64")
this_input = {"Inference": input, "Label": label}
if seq_length is not None:
this_input["SeqLength"] = seq_length
self._helper.append_op(
type='chunk_eval',
inputs=this_input,
outputs={
"Precision": [precision],
"Recall": [recall],
"F1-Score": [f1_score],
"NumInferChunks": [num_infer_chunks],
"NumLabelChunks": [num_label_chunks],
"NumCorrectChunks": [num_correct_chunks]
},
attrs={
"num_chunk_types": self.num_chunk_types,
"chunk_scheme": self.chunk_scheme,
"excluded_chunk_types": self.excluded_chunk_types or []
})
return (num_infer_chunks, num_label_chunks, num_correct_chunks)
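# Note (inferred from ChunkEval below): with the IOB scheme every chunk type
# contributes a *-B and a *-I tag, and the remaining label is the shared "O",
# hence num_chunk_types = ceil((num_labels - 1) / 2).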
class LacLoss(Loss):
    def __init__(self):
        super(LacLoss, self).__init__()
def forward(self, outputs, labels):
avg_cost = outputs[1]
return avg_cost
class ChunkEval(Metric):
def __init__(self, num_labels, name=None, *args, **kwargs):
super(ChunkEval, self).__init__(*args, **kwargs)
self._init_name(name)
self.chunk_eval = Chunk_eval(
int(math.ceil((num_labels - 1) / 2.0)), "IOB")
self.reset()
def add_metric_op(self, *args):
crf_decode = args[0]
lengths = args[2]
label = args[3]
(num_infer_chunks, num_label_chunks,
num_correct_chunks) = self.chunk_eval(
input=crf_decode, label=label, seq_length=lengths)
return [num_infer_chunks, num_label_chunks, num_correct_chunks]
def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks,
*args, **kwargs):
self.infer_chunks_total += num_infer_chunks
self.label_chunks_total += num_label_chunks
self.correct_chunks_total += num_correct_chunks
precision = float(
num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0
recall = float(
num_correct_chunks) / num_label_chunks if num_label_chunks else 0
f1_score = float(2 * precision * recall) / (
precision + recall) if num_correct_chunks else 0
return [precision, recall, f1_score]
def reset(self):
self.infer_chunks_total = 0
self.label_chunks_total = 0
self.correct_chunks_total = 0
def accumulate(self):
precision = float(
self.correct_chunks_total
) / self.infer_chunks_total if self.infer_chunks_total else 0
recall = float(
self.correct_chunks_total
) / self.label_chunks_total if self.label_chunks_total else 0
f1_score = float(2 * precision * recall) / (
precision + recall) if self.correct_chunks_total else 0
res = [precision, recall, f1_score]
return res
    def _init_name(self, name):
        # `name` is accepted for API compatibility; this metric always
        # reports the three values below
        self._name = ['precision', 'recall', 'F1']
def name(self):
return self._name
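# Standalone sketch of the precision/recall/F1 arithmetic used by ChunkEval
# above (the chunk counts are made up for illustration):
def _prf_demo(num_infer=10, num_label=8, num_correct=6):
    precision = float(num_correct) / num_infer if num_infer else 0.0
    recall = float(num_correct) / num_label if num_label else 0.0
    f1 = 2 * precision * recall / (precision + recall) if num_correct else 0.0
    return precision, recall, f1  # -> (0.6, 0.75, 0.666...)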
def main(args):
place = set_device(args.device)
fluid.enable_dygraph(place) if args.dynamic else None
inputs = [Input([None, None], 'int64', name='words'),
Input([None], 'int64', name='length'),
Input([None, None], 'int64', name='target')]
labels = [Input([None, None], 'int64', name='labels')]
feed_list = None if args.dynamic else [x.forward() for x in inputs + labels]
dataset = LacDataset(args)
train_path = args.train_file
test_path = args.test_file
train_generator = create_lexnet_data_generator(
args, reader=dataset, file_name=train_path, place=place, mode="train")
test_generator = create_lexnet_data_generator(
args, reader=dataset, file_name=test_path, place=place, mode="test")
train_dataset = create_dataloader(
train_generator, place, feed_list=feed_list)
test_dataset = create_dataloader(
test_generator, place, feed_list=feed_list)
vocab_size = dataset.vocab_size
num_labels = dataset.num_labels
model = SeqTagging(args, vocab_size, num_labels)
optim = AdamOptimizer(
learning_rate=args.base_learning_rate,
parameter_list=model.parameters())
model.prepare(
optim,
LacLoss(),
ChunkEval(num_labels),
inputs=inputs,
labels=labels,
device=args.device)
if args.init_from_checkpoint:
model.load(args.init_from_checkpoint)
if args.init_from_pretrain_model:
model.load(args.init_from_pretrain_model, reset_optimizer=True)
model.fit(train_dataset,
test_dataset,
epochs=args.epoch,
batch_size=args.batch_size,
eval_freq=args.eval_freq,
save_freq=args.save_freq,
save_dir=args.save_dir)
if __name__ == '__main__':
args = PDConfig(yaml_file="sequence_tagging.yaml")
args.build()
args.Print()
    use_gpu = args.device == "gpu"
check_gpu(use_gpu)
check_version()
main(args)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import paddle.fluid as fluid
__all__ = ['check_gpu', 'check_version']
def check_gpu(use_gpu):
"""
    Log an error and exit when use_gpu is set to True but the CPU
    version of PaddlePaddle is installed.
"""
err = "Config use_gpu cannot be set as true while you are " \
"using paddlepaddle cpu version ! \nPlease try: \n" \
"\t1. Install paddlepaddle-gpu to run model on GPU \n" \
"\t2. Set use_gpu as false in config file to run " \
"model on CPU"
try:
if use_gpu and not fluid.is_compiled_with_cuda():
print(err)
sys.exit(1)
except Exception as e:
pass
def check_version():
"""
    Log an error and exit when the installed version of PaddlePaddle
    does not satisfy the requirement.
    """
    err = "PaddlePaddle version 1.7 or higher is required, " \
          "or a suitable develop version.\n" \
          "Please make sure the installed version is compatible with your code."
try:
fluid.require_version('1.7.0')
except Exception as e:
print(err)
sys.exit(1)
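if __name__ == "__main__":
    # Minimal self-check sketch (assumes paddlepaddle >= 1.7 is installed):
    # only ask for the GPU when this build actually supports CUDA.
    check_version()
    check_gpu(fluid.is_compiled_with_cuda())
    print("environment check passed")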
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import argparse
import json
import yaml
import six
import logging
logging_only_message = "%(message)s"
logging_details = "%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s"
class JsonConfig(object):
"""
    A high-level API for handling a JSON configuration file.
"""
def __init__(self, config_path):
self._config_dict = self._parse(config_path)
def _parse(self, config_path):
try:
with open(config_path) as json_file:
config_dict = json.load(json_file)
        except Exception:
            raise IOError("Error in parsing config file '%s'" %
                          config_path)
else:
return config_dict
def __getitem__(self, key):
return self._config_dict[key]
def print_config(self):
for arg, value in sorted(six.iteritems(self._config_dict)):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
class ArgumentGroup(object):
def __init__(self, parser, title, des):
self._group = parser.add_argument_group(title=title, description=des)
def add_arg(self, name, type, default, help, **kwargs):
type = str2bool if type == bool else type
self._group.add_argument(
"--" + name,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
class ArgConfig(object):
"""
    A high-level API for handling argument configurations.
"""
def __init__(self):
parser = argparse.ArgumentParser()
train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5,
"Learning rate used to train with warmup.")
train_g.add_arg(
"lr_scheduler",
str,
"linear_warmup_decay",
"scheduler of learning rate.",
choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01,
"Weight decay rate for L2 regularizer.")
train_g.add_arg(
"warmup_proportion", float, 0.1,
"Proportion of training steps to perform linear learning rate warmup for."
)
train_g.add_arg("save_steps", int, 1000,
"The steps interval to save checkpoints.")
train_g.add_arg("use_fp16", bool, False,
"Whether to use fp16 mixed precision training.")
train_g.add_arg(
"loss_scaling", float, 1.0,
"Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled."
)
train_g.add_arg("pred_dir", str, None,
"Path to save the prediction results")
log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10,
"The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True,
"If set, use GPU for training.")
run_type_g.add_arg(
"use_fast_executor", bool, False,
"If set, use fast parallel executor (in experiment).")
run_type_g.add_arg(
"num_iteration_per_drop_scope", int, 1,
"Ihe iteration intervals to clean up temporary variables.")
run_type_g.add_arg("do_train", bool, True,
"Whether to perform training.")
run_type_g.add_arg("do_predict", bool, True,
"Whether to perform prediction.")
custom_g = ArgumentGroup(parser, "customize", "customized options.")
self.custom_g = custom_g
self.parser = parser
def add_arg(self, name, dtype, default, descrip):
self.custom_g.add_arg(name, dtype, default, descrip)
def build_conf(self):
return self.parser.parse_args()
def str2bool(v):
    # argparse cannot parse strings like "True"/"False" into Python
    # booleans directly, so do the conversion ourselves
    return v.lower() in ("true", "t", "1")
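# Illustrative behaviour sketch:
#   str2bool("True") -> True, str2bool("t") -> True, str2bool("1") -> True
#   any other string, e.g. "False" or "no", maps to False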
def print_arguments(args, log=None):
if not log:
print('----------- Configuration Arguments -----------')
for arg, value in sorted(six.iteritems(vars(args))):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
else:
log.info('----------- Configuration Arguments -----------')
for arg, value in sorted(six.iteritems(vars(args))):
log.info('%s: %s' % (arg, value))
log.info('------------------------------------------------')
class PDConfig(object):
"""
A high-level API for managing configuration files in PaddlePaddle.
    It works jointly with command-line arguments, JSON files and YAML files.
"""
def __init__(self, json_file="", yaml_file="", fuse_args=True):
"""
Init funciton for PDConfig.
json_file: the path to the json configure file.
yaml_file: the path to the yaml configure file.
fuse_args: if fuse the json/yaml configs with argparse.
"""
assert isinstance(json_file, str)
assert isinstance(yaml_file, str)
if json_file != "" and yaml_file != "":
raise Warning(
"json_file and yaml_file can not co-exist for now. please only use one configure file type."
)
return
self.args = None
self.arg_config = {}
self.json_config = {}
self.yaml_config = {}
parser = argparse.ArgumentParser()
self.default_g = ArgumentGroup(parser, "default", "default options.")
self.yaml_g = ArgumentGroup(parser, "yaml", "options from yaml.")
self.json_g = ArgumentGroup(parser, "json", "options from json.")
self.com_g = ArgumentGroup(parser, "custom", "customized options.")
self.default_g.add_arg("do_train", bool, False,
"Whether to perform training.")
self.default_g.add_arg("do_predict", bool, False,
"Whether to perform predicting.")
self.default_g.add_arg("do_eval", bool, False,
"Whether to perform evaluating.")
self.default_g.add_arg("do_save_inference_model", bool, False,
"Whether to perform model saving for inference.")
# NOTE: args for profiler
self.default_g.add_arg("is_profiler", int, 0, "the switch of profiler tools. (used for benchmark)")
self.default_g.add_arg("profiler_path", str, './', "the profiler output file path. (used for benchmark)")
self.default_g.add_arg("max_iter", int, 0, "the max train batch num.(used for benchmark)")
self.parser = parser
if json_file != "":
self.load_json(json_file, fuse_args=fuse_args)
if yaml_file:
self.load_yaml(yaml_file, fuse_args=fuse_args)
def load_json(self, file_path, fuse_args=True):
        if not os.path.exists(file_path):
            raise IOError("the json file %s does not exist." % file_path)
        with open(file_path, "r") as fin:
            self.json_config = json.loads(fin.read())
if fuse_args:
for name in self.json_config:
if isinstance(self.json_config[name], list):
self.json_g.add_arg(
name,
type(self.json_config[name][0]),
self.json_config[name],
"This is from %s" % file_path,
nargs=len(self.json_config[name]))
continue
if not isinstance(self.json_config[name], int) \
and not isinstance(self.json_config[name], float) \
and not isinstance(self.json_config[name], str) \
and not isinstance(self.json_config[name], bool):
continue
self.json_g.add_arg(name,
type(self.json_config[name]),
self.json_config[name],
"This is from %s" % file_path)
def load_yaml(self, file_path, fuse_args=True):
        if not os.path.exists(file_path):
            raise IOError("the yaml file %s does not exist." % file_path)
        with open(file_path, "r") as fin:
            self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader)
if fuse_args:
for name in self.yaml_config:
if isinstance(self.yaml_config[name], list):
self.yaml_g.add_arg(
name,
type(self.yaml_config[name][0]),
self.yaml_config[name],
"This is from %s" % file_path,
nargs=len(self.yaml_config[name]))
continue
if not isinstance(self.yaml_config[name], int) \
and not isinstance(self.yaml_config[name], float) \
and not isinstance(self.yaml_config[name], str) \
and not isinstance(self.yaml_config[name], bool):
continue
self.yaml_g.add_arg(name,
type(self.yaml_config[name]),
self.yaml_config[name],
"This is from %s" % file_path)
def build(self):
self.args = self.parser.parse_args()
self.arg_config = vars(self.args)
def __add__(self, new_arg):
assert isinstance(new_arg, list) or isinstance(new_arg, tuple)
assert len(new_arg) >= 3
assert self.args is None
name = new_arg[0]
dtype = new_arg[1]
dvalue = new_arg[2]
desc = new_arg[3] if len(
new_arg) == 4 else "Description is not provided."
self.com_g.add_arg(name, dtype, dvalue, desc)
return self
def __getattr__(self, name):
if name in self.arg_config:
return self.arg_config[name]
if name in self.json_config:
return self.json_config[name]
if name in self.yaml_config:
return self.yaml_config[name]
raise Warning("The argument %s is not defined." % name)
def Print(self):
print("-" * 70)
for name in self.arg_config:
print("%s:\t\t\t\t%s" % (str(name), str(self.arg_config[name])))
for name in self.json_config:
if name not in self.arg_config:
print("%s:\t\t\t\t%s" %
(str(name), str(self.json_config[name])))
for name in self.yaml_config:
if name not in self.arg_config:
print("%s:\t\t\t\t%s" %
(str(name), str(self.yaml_config[name])))
print("-" * 70)
if __name__ == "__main__":
"""
pd_config = PDConfig(json_file = "./test/bert_config.json")
pd_config.build()
print(pd_config.do_train)
print(pd_config.hidden_size)
pd_config = PDConfig(yaml_file = "./test/bert_config.yaml")
pd_config.build()
print(pd_config.do_train)
print(pd_config.hidden_size)
"""
pd_config = PDConfig(yaml_file="./test/bert_config.yaml")
pd_config += ("my_age", int, 18, "I am forever 18.")
pd_config.build()
print(pd_config.do_train)
print(pd_config.hidden_size)
print(pd_config.my_age)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import paddle.fluid as fluid
__all__ = ['chunk_count', 'build_chunk']
def build_chunk(data_list, id2label_dict):
"""
    Assemble entity chunks from a sequence of tag ids
"""
tag_list = [id2label_dict.get(str(id)) for id in data_list]
ner_dict = {}
ner_str = ""
ner_start = 0
for i in range(len(tag_list)):
tag = tag_list[i]
if tag == u"O":
if i != 0:
key = "%d_%d" % (ner_start, i - 1)
ner_dict[key] = ner_str
ner_start = i
ner_str = tag
elif tag.endswith(u"B"):
if i != 0:
key = "%d_%d" % (ner_start, i - 1)
ner_dict[key] = ner_str
ner_start = i
ner_str = tag.split('-')[0]
elif tag.endswith(u"I"):
if tag.split('-')[0] != ner_str:
if i != 0:
key = "%d_%d" % (ner_start, i - 1)
ner_dict[key] = ner_str
ner_start = i
ner_str = tag.split('-')[0]
return ner_dict
def chunk_count(infer_numpy, label_numpy, seq_len, id2label_dict):
"""
    calculate num_infer_chunks, num_label_chunks and num_correct_chunks for chunk metrics
"""
num_infer_chunks, num_label_chunks, num_correct_chunks = 0, 0, 0
assert infer_numpy.shape[0] == label_numpy.shape[0]
for i in range(infer_numpy.shape[0]):
infer_list = infer_numpy[i][: seq_len[i]]
label_list = label_numpy[i][: seq_len[i]]
infer_dict = build_chunk(infer_list, id2label_dict)
num_infer_chunks += len(infer_dict)
label_dict = build_chunk(label_list, id2label_dict)
num_label_chunks += len(label_dict)
for key in infer_dict:
if key in label_dict and label_dict[key] == infer_dict[key]:
num_correct_chunks += 1
return num_infer_chunks, num_label_chunks, num_correct_chunks
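if __name__ == "__main__":
    # Illustrative self-test with made-up ids; tags follow the
    # "<TYPE>-B"/"<TYPE>-I"/"O" scheme build_chunk() expects. Note that this
    # implementation also records "O" runs as chunks and drops a chunk that
    # is still open when the sequence ends.
    import numpy as np
    id2label = {"0": "O", "1": "PER-B", "2": "PER-I"}
    infer = np.array([[1, 2, 0, 1]])
    label = np.array([[1, 2, 0, 0]])
    print(chunk_count(infer, label, [4], id2label))  # -> (2, 2, 2)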
...@@ -19,10 +19,10 @@ import os ...@@ -19,10 +19,10 @@ import os
import argparse import argparse
import numpy as np import numpy as np
from model import Input, set_device from hapi.model import Input, set_device
from models import tsm_resnet50
from check import check_gpu, check_version from check import check_gpu, check_version
from modeling import tsm_resnet50
from kinetics_dataset import KineticsDataset from kinetics_dataset import KineticsDataset
from transforms import * from transforms import *
......
...@@ -26,7 +26,7 @@ except ImportError: ...@@ -26,7 +26,7 @@ except ImportError:
import pickle import pickle
from io import BytesIO from io import BytesIO
from paddle.fluid.io import Dataset from paddle.io import Dataset
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -100,19 +100,12 @@ class KineticsDataset(Dataset): ...@@ -100,19 +100,12 @@ class KineticsDataset(Dataset):
def __getitem__(self, idx): def __getitem__(self, idx):
pickle_path = os.path.join(self.pickle_dir, self.pickle_paths[idx]) pickle_path = os.path.join(self.pickle_dir, self.pickle_paths[idx])
try: if six.PY2:
if six.PY2: data = pickle.load(open(pickle_path, 'rb'))
data = pickle.load(open(pickle_path, 'rb')) else:
else: data = pickle.load(open(pickle_path, 'rb'), encoding='bytes')
data = pickle.load(open(pickle_path, 'rb'), encoding='bytes')
vid, label, frames = data
vid, label, frames = data
if len(frames) < 1:
logger.error("{} contains no frame".format(pickle_path))
sys.exit(-1)
except Exception as e:
logger.error("Load {} failed: {}".format(pickle_path, e))
sys.exit(-1)
if self.label_list is not None: if self.label_list is not None:
label = self.label_list.index(label) label = self.label_list.index(label)
......
...@@ -22,10 +22,10 @@ import numpy as np ...@@ -22,10 +22,10 @@ import numpy as np
from paddle import fluid from paddle import fluid
from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.fluid.dygraph.parallel import ParallelEnv
from model import Model, CrossEntropy, Input, set_device from hapi.model import Model, CrossEntropy, Input, set_device
from metrics import Accuracy from hapi.metrics import Accuracy
from models import tsm_resnet50
from modeling import tsm_resnet50
from check import check_gpu, check_version from check import check_gpu, check_version
from kinetics_dataset import KineticsDataset from kinetics_dataset import KineticsDataset
from transforms import * from transforms import *
......
...@@ -17,8 +17,8 @@ import paddle.fluid as fluid ...@@ -17,8 +17,8 @@ import paddle.fluid as fluid
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
from model import Model from hapi.model import Model
from .download import get_weights_path from hapi.download import get_weights_path
__all__ = ["TSM_ResNet", "tsm_resnet50"] __all__ = ["TSM_ResNet", "tsm_resnet50"]
...@@ -196,9 +196,17 @@ def _tsm_resnet(num_layers, seg_num=8, num_classes=400, pretrained=True): ...@@ -196,9 +196,17 @@ def _tsm_resnet(num_layers, seg_num=8, num_classes=400, pretrained=True):
weight_path = get_weights_path(*(pretrain_infos[num_layers])) weight_path = get_weights_path(*(pretrain_infos[num_layers]))
assert weight_path.endswith('.pdparams'), \ assert weight_path.endswith('.pdparams'), \
"suffix of weight must be .pdparams" "suffix of weight must be .pdparams"
model.load(weight_path[:-9]) model.load(weight_path)
return model return model
def tsm_resnet50(seg_num=8, num_classes=400, pretrained=True): def tsm_resnet50(seg_num=8, num_classes=400, pretrained=True):
"""TSM model with 50-layer ResNet as backbone
Args:
seg_num (int): segment number of each video sample. Default 8.
num_classes (int): video class number. Default 400.
    pretrained (bool): If True, returns a model with weights pre-trained
        on Kinetics-400 (the dataset used in this example), default True
"""
return _tsm_resnet(50, seg_num, num_classes, pretrained) return _tsm_resnet(50, seg_num, num_classes, pretrained)
dataset/voc*
pretrain_weights/darknet53_pretrained.pdparams
...@@ -101,11 +101,10 @@ The YOLOv3 network consists of a base feature extraction network and multi-scale feature fusion layers ...@@ -101,11 +101,10 @@ The YOLOv3 network consists of a base feature extraction network and multi-scale feature fusion layers
### Model training ### Model training
Once the data is ready, training and evaluation can be launched with the `main.py` script; the script below alternates training and model evaluation every epoch and saves checkpoints to the `yolo_checkpoint` directory by default. Once the data is ready, training and evaluation can be launched with the `main.py` script; the script below alternates training and model evaluation every epoch and saves checkpoints to the `yolo_checkpoint` directory by default.
YOLOv3 is trained with a total batch_size of 64; the following describes training on 4 Tesla P40 cards with a per-card batch_size of 16. For both static and dynamic graph modes, `--batch_size` in multi-card training is the batch size per card, i.e. the total batch size is `--batch_size` multiplied by the number of cards. YOLOv3 is trained with a total batch_size of 64; the following describes training on 4 Tesla P40 cards with a per-card batch_size of 16. For both static and dynamic graph modes, `--batch_size` in multi-card training is the batch size per card, i.e. the total batch size is `--batch_size` multiplied by the number of cards.
The arguments of the `main.py` script can be queried with the following command The arguments of the `main.py` script can be queried with the following command
```bash ```bash
......
...@@ -18,9 +18,8 @@ from __future__ import print_function ...@@ -18,9 +18,8 @@ from __future__ import print_function
import os import os
import cv2 import cv2
import numpy as np import numpy as np
from pycocotools.coco import COCO
from paddle.fluid.io import Dataset from paddle.io import Dataset
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -91,6 +90,7 @@ class COCODataset(Dataset): ...@@ -91,6 +90,7 @@ class COCODataset(Dataset):
self._load_roidb_and_cname2cid() self._load_roidb_and_cname2cid()
def _load_roidb_and_cname2cid(self): def _load_roidb_and_cname2cid(self):
from pycocotools.coco import COCO
assert self._anno_path.endswith('.json'), \ assert self._anno_path.endswith('.json'), \
'invalid coco annotation file: ' + anno_path 'invalid coco annotation file: ' + anno_path
coco = COCO(self._anno_path) coco = COCO(self._anno_path)
...@@ -186,30 +186,31 @@ class COCODataset(Dataset): ...@@ -186,30 +186,31 @@ class COCODataset(Dataset):
data = np.frombuffer(f.read(), dtype='uint8') data = np.frombuffer(f.read(), dtype='uint8')
im = cv2.imdecode(data, 1) im = cv2.imdecode(data, 1)
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
im_info = np.array([roidb['im_id'][0], roidb['h'], roidb['w']], dtype='int32') im_id = roidb['im_id']
im_shape = np.array([roidb['h'], roidb['w']], dtype='int32')
gt_bbox = roidb['gt_bbox'] gt_bbox = roidb['gt_bbox']
gt_class = roidb['gt_class'] gt_class = roidb['gt_class']
gt_score = roidb['gt_score'] gt_score = roidb['gt_score']
return im_info, im, gt_bbox, gt_class, gt_score return im_id, im_shape, im, gt_bbox, gt_class, gt_score
def __getitem__(self, idx): def __getitem__(self, idx):
im_info, im, gt_bbox, gt_class, gt_score = self._getitem_by_index(idx) im_id, im_shape, im, gt_bbox, gt_class, gt_score = self._getitem_by_index(idx)
if self._mixup: if self._mixup:
mixup_idx = idx + np.random.randint(1, self.__len__()) mixup_idx = idx + np.random.randint(1, self.__len__())
mixup_idx %= self.__len__() mixup_idx %= self.__len__()
_, mixup_im, mixup_bbox, mixup_class, _ = \ _, _, mixup_im, mixup_bbox, mixup_class, _ = \
self._getitem_by_index(mixup_idx) self._getitem_by_index(mixup_idx)
im, gt_bbox, gt_class, gt_score = \ im_shape, im, gt_bbox, gt_class, gt_score = \
self._mixup_image(im, gt_bbox, gt_class, mixup_im, self._mixup_image(im, gt_bbox, gt_class, mixup_im,
mixup_bbox, mixup_class) mixup_bbox, mixup_class)
if self._transform: if self._transform:
im_info, im, gt_bbox, gt_class, gt_score = \ im_id, im_shape, im, gt_bbox, gt_class, gt_score = \
self._transform(im_info, im, gt_bbox, gt_class, gt_score) self._transform(im_id, im_shape, im, gt_bbox, gt_class, gt_score)
return [im_info, im, gt_bbox, gt_class, gt_score] return [im_id, im_shape, im, gt_bbox, gt_class, gt_score]
def _mixup_image(self, img1, bbox1, class1, img2, bbox2, class2): def _mixup_image(self, img1, bbox1, class1, img2, bbox2, class2):
factor = np.random.beta(self._alpha, self._beta) factor = np.random.beta(self._alpha, self._beta)
...@@ -234,7 +235,9 @@ class COCODataset(Dataset): ...@@ -234,7 +235,9 @@ class COCODataset(Dataset):
score2 = np.ones_like(class2, dtype="float32") * (1.0 - factor) score2 = np.ones_like(class2, dtype="float32") * (1.0 - factor)
gt_score = np.concatenate((score1, score2), axis=0) gt_score = np.concatenate((score1, score2), axis=0)
return img, gt_bbox, gt_class, gt_score im_shape = np.array([h, w], dtype='int32')
return im_shape, img, gt_bbox, gt_class, gt_score
@property @property
def mixup(self): def mixup(self):
......
...@@ -17,7 +17,7 @@ import os.path as osp ...@@ -17,7 +17,7 @@ import os.path as osp
import sys import sys
import tarfile import tarfile
from models.download import _download from hapi.download import _download
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
......
...@@ -22,13 +22,13 @@ from PIL import Image ...@@ -22,13 +22,13 @@ from PIL import Image
from paddle import fluid from paddle import fluid
from paddle.fluid.optimizer import Momentum from paddle.fluid.optimizer import Momentum
from paddle.fluid.io import DataLoader from paddle.io import DataLoader
from model import Model, Input, set_device from hapi.model import Model, Input, set_device
from models import yolov3_darknet53, YoloLoss
from coco import COCODataset from modeling import yolov3_darknet53, YoloLoss
from transforms import * from transforms import *
from visualizer import draw_bbox from visualizer import draw_bbox
import logging import logging
...@@ -65,7 +65,8 @@ def main(): ...@@ -65,7 +65,8 @@ def main():
device = set_device(FLAGS.device) device = set_device(FLAGS.device)
fluid.enable_dygraph(device) if FLAGS.dynamic else None fluid.enable_dygraph(device) if FLAGS.dynamic else None
inputs = [Input([None, 3], 'int32', name='img_info'), inputs = [Input([None, 1], 'int64', name='img_id'),
Input([None, 2], 'int32', name='img_shape'),
Input([None, 3, None, None], 'float32', name='image')] Input([None, 3, None, None], 'float32', name='image')]
cat2name = load_labels(FLAGS.label_list, with_background=False) cat2name = load_labels(FLAGS.label_list, with_background=False)
...@@ -87,9 +88,10 @@ def main(): ...@@ -87,9 +88,10 @@ def main():
img -= np.array(IMAGE_MEAN) img -= np.array(IMAGE_MEAN)
img /= np.array(IMAGE_STD) img /= np.array(IMAGE_STD)
img = img.transpose((2, 0, 1))[np.newaxis, :] img = img.transpose((2, 0, 1))[np.newaxis, :]
img_info = np.array([0, h, w]).astype('int32')[np.newaxis, :] img_id = np.array([0]).astype('int64')[np.newaxis, :]
img_shape = np.array([h, w]).astype('int32')[np.newaxis, :]
_, bboxes = model.test([img_info, img]) _, bboxes = model.test([img_id, img_shape, img])
vis_img = draw_bbox(orig_img, cat2name, bboxes, FLAGS.draw_threshold) vis_img = draw_bbox(orig_img, cat2name, bboxes, FLAGS.draw_threshold)
save_name = get_save_image_name(FLAGS.output_dir, FLAGS.infer_image) save_name = get_save_image_name(FLAGS.output_dir, FLAGS.infer_image)
......
...@@ -23,14 +23,15 @@ import numpy as np ...@@ -23,14 +23,15 @@ import numpy as np
from paddle import fluid from paddle import fluid
from paddle.fluid.optimizer import Momentum from paddle.fluid.optimizer import Momentum
from paddle.fluid.io import DataLoader from paddle.io import DataLoader
from model import Model, Input, set_device from hapi.model import Model, Input, set_device
from distributed import DistributedBatchSampler from hapi.distributed import DistributedBatchSampler
from models import yolov3_darknet53, YoloLoss from hapi.vision.transforms import Compose, BatchCompose
from coco_metric import COCOMetric from modeling import yolov3_darknet53, YoloLoss
from coco import COCODataset from coco import COCODataset
from coco_metric import COCOMetric
from transforms import * from transforms import *
NUM_MAX_BOXES = 50 NUM_MAX_BOXES = 50
...@@ -63,7 +64,8 @@ def main(): ...@@ -63,7 +64,8 @@ def main():
device = set_device(FLAGS.device) device = set_device(FLAGS.device)
fluid.enable_dygraph(device) if FLAGS.dynamic else None fluid.enable_dygraph(device) if FLAGS.dynamic else None
inputs = [Input([None, 3], 'int32', name='img_info'), inputs = [Input([None, 1], 'int64', name='img_id'),
Input([None, 2], 'int32', name='img_shape'),
Input([None, 3, None, None], 'float32', name='image')] Input([None, 3, None, None], 'float32', name='image')]
labels = [Input([None, NUM_MAX_BOXES, 4], 'float32', name='gt_bbox'), labels = [Input([None, NUM_MAX_BOXES, 4], 'float32', name='gt_bbox'),
Input([None, NUM_MAX_BOXES], 'int32', name='gt_label'), Input([None, NUM_MAX_BOXES], 'int32', name='gt_label'),
...@@ -123,7 +125,7 @@ def main(): ...@@ -123,7 +125,7 @@ def main():
model_mode='eval' if FLAGS.eval_only else 'train', model_mode='eval' if FLAGS.eval_only else 'train',
pretrained=pretrained) pretrained=pretrained)
if FLAGS.pretrain_weights is not None: if FLAGS.pretrain_weights and not FLAGS.eval_only:
model.load(FLAGS.pretrain_weights, skip_mismatch=True, reset_optimizer=True) model.load(FLAGS.pretrain_weights, skip_mismatch=True, reset_optimizer=True)
optim = make_optimizer(len(batch_sampler), parameter_list=model.parameters()) optim = make_optimizer(len(batch_sampler), parameter_list=model.parameters())
...@@ -163,7 +165,7 @@ def main(): ...@@ -163,7 +165,7 @@ def main():
save_dir="yolo_checkpoint/mixup", save_dir="yolo_checkpoint/mixup",
save_freq=10) save_freq=10)
# do not use image mixup transfrom in laste FLAGS.no_mixup_epoch epoches # do not use image mixup transform in the last FLAGS.no_mixup_epoch epochs
dataset.mixup = False dataset.mixup = False
model.fit(train_data=loader, model.fit(train_data=loader,
epochs=FLAGS.no_mixup_epoch, epochs=FLAGS.no_mixup_epoch,
......
...@@ -16,13 +16,13 @@ from __future__ import division ...@@ -16,13 +16,13 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Conv2D from paddle.fluid.dygraph.nn import Conv2D, BatchNorm
from paddle.fluid.param_attr import ParamAttr from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.regularizer import L2Decay from paddle.fluid.regularizer import L2Decay
from model import Model, Loss from hapi.model import Model, Loss
from .darknet import darknet53, ConvBNLayer from hapi.download import get_weights_path
from .download import get_weights_path from hapi.vision.models import darknet53
__all__ = ['YoloLoss', 'YOLOv3', 'yolov3_darknet53'] __all__ = ['YoloLoss', 'YOLOv3', 'yolov3_darknet53']
...@@ -33,6 +33,46 @@ pretrain_infos = { ...@@ -33,6 +33,46 @@ pretrain_infos = {
} }
class ConvBNLayer(fluid.dygraph.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size=3,
stride=1,
groups=1,
padding=0,
act="leaky"):
super(ConvBNLayer, self).__init__()
self.conv = Conv2D(
num_channels=ch_in,
num_filters=ch_out,
filter_size=filter_size,
stride=stride,
padding=padding,
groups=groups,
param_attr=ParamAttr(
initializer=fluid.initializer.Normal(0., 0.02)),
bias_attr=False,
act=None)
self.batch_norm = BatchNorm(
num_channels=ch_out,
param_attr=ParamAttr(
initializer=fluid.initializer.Normal(0., 0.02),
regularizer=L2Decay(0.)),
bias_attr=ParamAttr(
initializer=fluid.initializer.Constant(0.0),
regularizer=L2Decay(0.)))
self.act = act
def forward(self, inputs):
out = self.conv(inputs)
out = self.batch_norm(out)
if self.act == 'leaky':
out = fluid.layers.leaky_relu(x=out, alpha=0.1)
return out
class YoloDetectionBlock(fluid.dygraph.Layer): class YoloDetectionBlock(fluid.dygraph.Layer):
def __init__(self, ch_in, channel): def __init__(self, ch_in, channel):
super(YoloDetectionBlock, self).__init__() super(YoloDetectionBlock, self).__init__()
...@@ -88,6 +128,20 @@ class YoloDetectionBlock(fluid.dygraph.Layer): ...@@ -88,6 +128,20 @@ class YoloDetectionBlock(fluid.dygraph.Layer):
class YOLOv3(Model): class YOLOv3(Model):
"""YOLOv3 model from
`"YOLOv3: An Incremental Improvement" <https://arxiv.org/abs/1804.02767>`_
Args:
num_classes (int): class number, default 80.
        model_mode (str): 'train', 'eval' or 'test' mode, the network structure
            differs in the output layer and data: in 'train' mode no output
            layer is appended; in 'eval' and 'test' the output feature maps
            are decoded to predictions by 'fluid.layers.yolo_box'; 'eval' mode
            returns both feature maps and predictions, while 'test' mode only
            returns predictions. Default 'train'.
"""
def __init__(self, num_classes=80, model_mode='train'): def __init__(self, num_classes=80, model_mode='train'):
super(YOLOv3, self).__init__() super(YOLOv3, self).__init__()
self.num_classes = num_classes self.num_classes = num_classes
...@@ -138,7 +192,7 @@ class YOLOv3(Model): ...@@ -138,7 +192,7 @@ class YOLOv3(Model):
act='leaky_relu')) act='leaky_relu'))
self.route_blocks.append(route) self.route_blocks.append(route)
def forward(self, img_info, inputs): def forward(self, img_id, img_shape, inputs):
outputs = [] outputs = []
boxes = [] boxes = []
scores = [] scores = []
...@@ -163,8 +217,6 @@ class YOLOv3(Model): ...@@ -163,8 +217,6 @@ class YOLOv3(Model):
for m in anchor_mask: for m in anchor_mask:
mask_anchors.append(self.anchors[2 * m]) mask_anchors.append(self.anchors[2 * m])
mask_anchors.append(self.anchors[2 * m + 1]) mask_anchors.append(self.anchors[2 * m + 1])
img_shape = fluid.layers.slice(img_info, axes=[1], starts=[1], ends=[3])
img_id = fluid.layers.slice(img_info, axes=[1], starts=[0], ends=[1])
b, s = fluid.layers.yolo_box( b, s = fluid.layers.yolo_box(
x=block_out, x=block_out,
img_size=img_shape, img_size=img_shape,
...@@ -181,7 +233,7 @@ class YOLOv3(Model): ...@@ -181,7 +233,7 @@ class YOLOv3(Model):
if self.model_mode == 'train': if self.model_mode == 'train':
return outputs return outputs
preds = [img_id[0, :], preds = [img_id,
fluid.layers.multiclass_nms( fluid.layers.multiclass_nms(
bboxes=fluid.layers.concat(boxes, axis=1), bboxes=fluid.layers.concat(boxes, axis=1),
scores=fluid.layers.concat(scores, axis=2), scores=fluid.layers.concat(scores, axis=2),
...@@ -242,9 +294,22 @@ def _yolov3_darknet(num_layers=53, num_classes=80, ...@@ -242,9 +294,22 @@ def _yolov3_darknet(num_layers=53, num_classes=80,
weight_path = get_weights_path(*(pretrain_infos[num_layers])) weight_path = get_weights_path(*(pretrain_infos[num_layers]))
assert weight_path.endswith('.pdparams'), \ assert weight_path.endswith('.pdparams'), \
"suffix of weight must be .pdparams" "suffix of weight must be .pdparams"
model.load(weight_path[:-9]) model.load(weight_path)
return model return model
def yolov3_darknet53(num_classes=80, model_mode='train', pretrained=True): def yolov3_darknet53(num_classes=80, model_mode='train', pretrained=True):
"""YOLOv3 model with 53-layer DarkNet as backbone
Args:
num_classes (int): class number, default 80.
        model_mode (str): 'train', 'eval' or 'test' mode, the network structure
            differs in the output layer and data: in 'train' mode no output
            layer is appended; in 'eval' and 'test' the output feature maps
            are decoded to predictions by 'fluid.layers.yolo_box'; 'eval' mode
            returns both feature maps and predictions, while 'test' mode only
            returns predictions. Default 'train'.
        pretrained (bool): If True, returns a model with weights pre-trained
            on COCO, default True
"""
return _yolov3_darknet(53, num_classes, model_mode, pretrained) return _yolov3_darknet(53, num_classes, model_mode, pretrained)
...@@ -19,48 +19,18 @@ import cv2 ...@@ -19,48 +19,18 @@ import cv2
import traceback import traceback
import numpy as np import numpy as np
import logging __all__ = [
logger = logging.getLogger(__name__) 'ColorDistort',
'RandomExpand',
__all__ = ['ColorDistort', 'RandomExpand', 'RandomCrop', 'RandomFlip', 'RandomCrop',
'NormalizeBox', 'PadBox', 'RandomShape', 'NormalizeImage', 'RandomFlip',
'BboxXYXY2XYWH', 'ResizeImage', 'Compose', 'BatchCompose'] 'NormalizeBox',
'PadBox',
'RandomShape',
class Compose(object): 'NormalizeImage',
def __init__(self, transforms=[]): 'BboxXYXY2XYWH',
self.transforms = transforms 'ResizeImage',
]
def __call__(self, *data):
for f in self.transforms:
try:
data = f(*data)
except Exception as e:
stack_info = traceback.format_exc()
logger.info("fail to perform transform [{}] with error: "
"{} and stack:\n{}".format(f, e, str(stack_info)))
raise e
return data
class BatchCompose(object):
def __init__(self, transforms=[]):
self.transforms = transforms
def __call__(self, data):
for f in self.transforms:
try:
data = f(data)
except Exception as e:
stack_info = traceback.format_exc()
logger.info("fail to perform batch transform [{}] with error: "
"{} and stack:\n{}".format(f, e, str(stack_info)))
raise e
# sample list to batch data
batch = list(zip(*data))
return batch
class ColorDistort(object): class ColorDistort(object):
...@@ -145,7 +115,7 @@ class ColorDistort(object): ...@@ -145,7 +115,7 @@ class ColorDistort(object):
img += delta img += delta
return img return img
def __call__(self, im_info, im, gt_bbox, gt_class, gt_score): def __call__(self, im_id, im_shape, im, gt_bbox, gt_class, gt_score):
if self.random_apply: if self.random_apply:
distortions = np.random.permutation([ distortions = np.random.permutation([
self.apply_brightness, self.apply_contrast, self.apply_brightness, self.apply_contrast,
...@@ -153,7 +123,7 @@ class ColorDistort(object): ...@@ -153,7 +123,7 @@ class ColorDistort(object):
]) ])
for func in distortions: for func in distortions:
im = func(im) im = func(im)
return [im_info, im, gt_bbox, gt_class, gt_score] return [im_id, im_shape, im, gt_bbox, gt_class, gt_score]
im = self.apply_brightness(im) im = self.apply_brightness(im)
...@@ -165,7 +135,7 @@ class ColorDistort(object): ...@@ -165,7 +135,7 @@ class ColorDistort(object):
im = self.apply_saturation(im) im = self.apply_saturation(im)
im = self.apply_hue(im) im = self.apply_hue(im)
im = self.apply_contrast(im) im = self.apply_contrast(im)
return [im_info, im, gt_bbox, gt_class, gt_score] return [im_id, im_shape, im, gt_bbox, gt_class, gt_score]
class RandomExpand(object): class RandomExpand(object):
...@@ -183,16 +153,16 @@ class RandomExpand(object): ...@@ -183,16 +153,16 @@ class RandomExpand(object):
self.prob = prob self.prob = prob
self.fill_value = fill_value self.fill_value = fill_value
def __call__(self, im_info, im, gt_bbox, gt_class, gt_score): def __call__(self, im_id, im_shape, im, gt_bbox, gt_class, gt_score):
if np.random.uniform(0., 1.) < self.prob: if np.random.uniform(0., 1.) < self.prob:
return [im_info, im, gt_bbox, gt_class, gt_score] return [im_id, im_shape, im, gt_bbox, gt_class, gt_score]
height, width, _ = im.shape height, width, _ = im.shape
expand_ratio = np.random.uniform(1., self.ratio) expand_ratio = np.random.uniform(1., self.ratio)
h = int(height * expand_ratio) h = int(height * expand_ratio)
w = int(width * expand_ratio) w = int(width * expand_ratio)
if not h > height or not w > width: if not h > height or not w > width:
return [im_info, im, gt_bbox, gt_class, gt_score] return [im_id, im_shape, im, gt_bbox, gt_class, gt_score]
y = np.random.randint(0, h - height) y = np.random.randint(0, h - height)
x = np.random.randint(0, w - width) x = np.random.randint(0, w - width)
canvas = np.ones((h, w, 3), dtype=np.uint8) canvas = np.ones((h, w, 3), dtype=np.uint8)
...@@ -201,7 +171,7 @@ class RandomExpand(object): ...@@ -201,7 +171,7 @@ class RandomExpand(object):
gt_bbox += np.array([x, y, x, y], dtype=np.float32) gt_bbox += np.array([x, y, x, y], dtype=np.float32)
return [im_info, canvas, gt_bbox, gt_class, gt_score] return [im_id, im_shape, canvas, gt_bbox, gt_class, gt_score]
class RandomCrop(): class RandomCrop():
...@@ -232,9 +202,9 @@ class RandomCrop(): ...@@ -232,9 +202,9 @@ class RandomCrop():
self.allow_no_crop = allow_no_crop self.allow_no_crop = allow_no_crop
self.cover_all_box = cover_all_box self.cover_all_box = cover_all_box
def __call__(self, im_info, im, gt_bbox, gt_class, gt_score): def __call__(self, im_id, im_shape, im, gt_bbox, gt_class, gt_score):
if len(gt_bbox) == 0: if len(gt_bbox) == 0:
return [im_info, im, gt_bbox, gt_class, gt_score] return [im_id, im_shape, im, gt_bbox, gt_class, gt_score]
# NOTE Original method attempts to generate one candidate for each # NOTE Original method attempts to generate one candidate for each
# threshold then randomly sample one from the resulting list. # threshold then randomly sample one from the resulting list.
...@@ -251,7 +221,7 @@ class RandomCrop(): ...@@ -251,7 +221,7 @@ class RandomCrop():
for thresh in thresholds: for thresh in thresholds:
if thresh == 'no_crop': if thresh == 'no_crop':
return [im_info, im, gt_bbox, gt_class, gt_score] return [im_id, im_shape, im, gt_bbox, gt_class, gt_score]
h, w, _ = im.shape h, w, _ = im.shape
found = False found = False
...@@ -286,9 +256,9 @@ class RandomCrop(): ...@@ -286,9 +256,9 @@ class RandomCrop():
gt_bbox = np.take(cropped_box, valid_ids, axis=0) gt_bbox = np.take(cropped_box, valid_ids, axis=0)
gt_class = np.take(gt_class, valid_ids, axis=0) gt_class = np.take(gt_class, valid_ids, axis=0)
gt_score = np.take(gt_score, valid_ids, axis=0)
return [im_id, im_shape, im, gt_bbox, gt_class, gt_score]
return [im_id, im_shape, im, gt_bbox, gt_class, gt_score]
def _iou_matrix(self, a, b):
tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2])
@@ -334,7 +304,7 @@ class RandomFlip():
isinstance(self.is_normalized, bool)):
raise TypeError("{}: input type is invalid.".format(self))
def __call__(self, im_id, im_shape, im, gt_bbox, gt_class, gt_score):
"""Flip the image and bounding boxes.
Operators:
1. Flip the image numpy.
@@ -363,20 +333,20 @@ class RandomFlip():
m = "{}: invalid box, x2 should be greater than x1".format(
self)
raise ValueError(m)
return [im_id, im_shape, im, gt_bbox, gt_class, gt_score]
class NormalizeBox(object):
"""Transform the bounding box's coordinates to [0, 1]."""
def __call__(self, im_id, im_shape, im, gt_bbox, gt_class, gt_score):
height, width, _ = im.shape
for i in range(gt_bbox.shape[0]):
gt_bbox[i][0] = gt_bbox[i][0] / width
gt_bbox[i][1] = gt_bbox[i][1] / height
gt_bbox[i][2] = gt_bbox[i][2] / width
gt_bbox[i][3] = gt_bbox[i][3] / height
return [im_id, im_shape, im, gt_bbox, gt_class, gt_score]
class PadBox(object):
@@ -388,7 +358,7 @@ class PadBox(object):
"""
self.num_max_boxes = num_max_boxes
def __call__(self, im_id, im_shape, im, gt_bbox, gt_class, gt_score):
gt_num = min(self.num_max_boxes, len(gt_bbox))
num_max = self.num_max_boxes
@@ -406,7 +376,7 @@ class PadBox(object):
if gt_num > 0:
pad_score[:gt_num] = gt_score[:gt_num, 0]
gt_score = pad_score
return [im_id, im_shape, im, gt_bbox, gt_class, gt_score]
class BboxXYXY2XYWH(object):
@@ -414,10 +384,10 @@ class BboxXYXY2XYWH(object):
Convert bbox XYXY format to XYWH format.
"""
def __call__(self, im_id, im_shape, im, gt_bbox, gt_class, gt_score):
gt_bbox[:, 2:4] = gt_bbox[:, 2:4] - gt_bbox[:, :2]
gt_bbox[:, :2] = gt_bbox[:, :2] + gt_bbox[:, 2:4] / 2.
return [im_id, im_shape, im, gt_bbox, gt_class, gt_score]
class RandomShape(object):
@@ -450,13 +420,13 @@ class RandomShape(object):
method = np.random.choice(self.interps) if self.random_inter \
else cv2.INTER_NEAREST
for i in range(len(samples)):
im = samples[i][2]
h, w = im.shape[:2]
scale_x = float(shape) / w
scale_y = float(shape) / h
im = cv2.resize(
im, None, None, fx=scale_x, fy=scale_y, interpolation=method)
samples[i][2] = im
return samples
@@ -492,7 +462,7 @@ class NormalizeImage(object):
3. (optional) permute channel
"""
for i in range(len(samples)):
im = samples[i][2]
im = im.astype(np.float32, copy=False)
mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
std = np.array(self.std)[np.newaxis, np.newaxis, :]
@@ -502,7 +472,7 @@ class NormalizeImage(object):
im /= std
if self.channel_first:
im = im.transpose((2, 0, 1))
samples[i][2] = im
return samples
@@ -595,16 +565,15 @@ class ResizeImage(object):
format(type(target_size)))
self.target_size = target_size
def __call__(self, im_id, im_shape, im, gt_bbox, gt_class, gt_score):
""" Resize the image numpy.
"""
if not isinstance(im, np.ndarray):
raise TypeError("{}: image type is not numpy.".format(self))
if len(im.shape) != 3:
raise ImageError('{}: image is not 3-dimensional.'.format(self))
im_scale_x = float(self.target_size) / float(im.shape[1])
im_scale_y = float(self.target_size) / float(im.shape[0])
resize_w = self.target_size
resize_h = self.target_size
@@ -616,5 +585,5 @@ class ResizeImage(object):
fy=im_scale_y,
interpolation=self.interp)
return [im_id, im_shape, im, gt_bbox, gt_class, gt_score]
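# Usage sketch (illustrative, not part of the files above): each per-sample
# operator shares the [im_id, im_shape, im, gt_bbox, gt_class, gt_score]
# signature, so a pipeline can thread the sample fields through a list of
# operators. `SampleCompose` is a hypothetical helper, and the constructor
# arguments of the operators below are assumptions.
class SampleCompose(object):
    """Thread the shared sample fields through a list of operators."""

    def __init__(self, ops):
        self.ops = ops

    def __call__(self, *sample):
        for op in self.ops:
            sample = op(*sample)
        return sample

sample_ops = SampleCompose(
    [RandomFlip(), NormalizeBox(), BboxXYXY2XYWH(), PadBox(num_max_boxes=50)])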
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from hapi.configure import Config
from hapi import callbacks
from hapi import datasets
from hapi import distributed
from hapi import download
from hapi import metrics
from hapi import model
from hapi import progressbar
from hapi import text
from hapi import vision
__all__ = [
'Config',
'callbacks',
'datasets',
'distributed',
'download',
'metrics',
'model',
'progressbar',
'text',
'vision',
]
@@ -15,7 +15,7 @@
import six
import copy
from .progressbar import ProgressBar
from paddle.fluid.dygraph.parallel import ParallelEnv
@@ -218,8 +218,6 @@ class ProgBarLogger(Callback):
# if steps is not None, last step will update in on_epoch_end
if self.steps and self.train_step < self.steps:
self._updates(logs, 'train')
def on_epoch_end(self, epoch, logs=None):
logs = logs or {}
@@ -238,7 +236,7 @@ class ProgBarLogger(Callback):
def on_eval_batch_end(self, step, logs=None):
logs = logs or {}
self.eval_step += 1
samples = logs.get('batch_size', 1)
self.evaled_samples += samples
...
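# A minimal custom callback sketch built on the hooks shown above; it assumes
# `Callback` is importable from hapi.callbacks and that `logs` carries a
# 'batch_size' entry, as ProgBarLogger relies on.
from hapi.callbacks import Callback

class EvalSampleCounter(Callback):
    def on_eval_begin(self, logs=None):
        self.eval_step = 0
        self.evaled_samples = 0

    def on_eval_batch_end(self, step, logs=None):
        logs = logs or {}
        self.eval_step += 1  # count steps locally, mirroring the fix above
        self.evaled_samples += logs.get('batch_size', 1)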
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import argparse
import json
import yaml
import six
import logging
logging_only_message = "%(message)s"
logging_details = "%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s"
class JsonConfig(object):
"""
A high-level api for handling json configure file.
"""
def __init__(self, config_path):
self._config_dict = self._parse(config_path)
def _parse(self, config_path):
try:
with open(config_path) as json_file:
config_dict = json.load(json_file)
except Exception:
raise IOError("Error in parsing bert model config file '%s'" %
config_path)
else:
return config_dict
def __getitem__(self, key):
return self._config_dict[key]
def print_config(self):
for arg, value in sorted(six.iteritems(self._config_dict)):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
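# Usage sketch for JsonConfig; the config path and key are illustrative.
json_cfg = JsonConfig("./config/bert_config.json")
print(json_cfg["hidden_size"])  # dict-style access via __getitem__
json_cfg.print_config()         # dump all key/value pairs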
class ArgumentGroup(object):
def __init__(self, parser, title, des):
self._group = parser.add_argument_group(title=title, description=des)
def add_arg(self, name, type, default, help, **kwargs):
type = str2bool if type == bool else type
self._group.add_argument(
"--" + name,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
class ArgConfig(object):
"""
A high-level api for handling argument configs.
"""
def __init__(self):
parser = argparse.ArgumentParser()
custom_g = ArgumentGroup(parser, "customize", "customized options.")
self.custom_g = custom_g
self.parser = parser
def add_arg(self, name, dtype, default, descrip):
self.custom_g.add_arg(name, dtype, default, descrip)
def build_conf(self):
return self.parser.parse_args()
def str2bool(v):
# argparse cannot parse strings like "True"/"False" as Python
# booleans directly, so convert manually
return v.lower() in ("true", "t", "1")
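# Usage sketch for ArgConfig; flag names and defaults are illustrative. Since
# bool arguments are routed through str2bool, `--use_cuda False` parses to False.
arg_cfg = ArgConfig()
arg_cfg.add_arg("use_cuda", bool, True, "Whether to run on GPU.")
arg_cfg.add_arg("batch_size", int, 32, "Batch size.")
parsed = arg_cfg.build_conf()
print(parsed.use_cuda, parsed.batch_size)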
def print_arguments(args, log=None):
if not log:
print('----------- Configuration Arguments -----------')
for arg, value in sorted(six.iteritems(vars(args))):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
else:
log.info('----------- Configuration Arguments -----------')
for arg, value in sorted(six.iteritems(vars(args))):
log.info('%s: %s' % (arg, value))
log.info('------------------------------------------------')
class Config(object):
"""
A high-level API for managing configuration files in PaddlePaddle.
Can work jointly with command-line arguments, JSON files and YAML files.
"""
def __init__(self, json_file="", yaml_file="", fuse_args=True):
"""
Init function for Config.
json_file: the path to the json configure file.
yaml_file: the path to the yaml configure file.
fuse_args: if fuse the json/yaml configs with argparse.
"""
assert isinstance(json_file, str)
assert isinstance(yaml_file, str)
if json_file != "" and yaml_file != "":
raise ValueError(
"json_file and yaml_file can not co-exist for now. Please only use one configure file type."
)
self.args = None
self.arg_config = {}
self.json_config = {}
self.yaml_config = {}
parser = argparse.ArgumentParser()
self.default_g = ArgumentGroup(parser, "default", "default options.")
self.yaml_g = ArgumentGroup(parser, "yaml", "options from yaml.")
self.json_g = ArgumentGroup(parser, "json", "options from json.")
self.com_g = ArgumentGroup(parser, "custom", "customized options.")
self.parser = parser
if json_file != "":
self.load_json(json_file, fuse_args=fuse_args)
if yaml_file:
self.load_yaml(yaml_file, fuse_args=fuse_args)
def load_json(self, file_path, fuse_args=True):
if not os.path.exists(file_path):
raise Warning("the json file %s does not exist." % file_path)
return
with open(file_path, "r") as fin:
self.json_config = json.loads(fin.read())
fin.close()
if fuse_args:
for name in self.json_config:
if isinstance(self.json_config[name], list):
self.json_g.add_arg(
name,
type(self.json_config[name][0]),
self.json_config[name],
"This is from %s" % file_path,
nargs=len(self.json_config[name]))
continue
if not isinstance(self.json_config[name], int) \
and not isinstance(self.json_config[name], float) \
and not isinstance(self.json_config[name], str) \
and not isinstance(self.json_config[name], bool):
continue
self.json_g.add_arg(name,
type(self.json_config[name]),
self.json_config[name],
"This is from %s" % file_path)
def load_yaml(self, file_path, fuse_args=True):
if not os.path.exists(file_path):
raise Warning("the yaml file %s does not exist." % file_path)
return
with open(file_path, "r") as fin:
self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader)
fin.close()
if fuse_args:
for name in self.yaml_config:
if isinstance(self.yaml_config[name], list):
self.yaml_g.add_arg(
name,
type(self.yaml_config[name][0]),
self.yaml_config[name],
"This is from %s" % file_path,
nargs=len(self.yaml_config[name]))
continue
if not isinstance(self.yaml_config[name], int) \
and not isinstance(self.yaml_config[name], float) \
and not isinstance(self.yaml_config[name], str) \
and not isinstance(self.yaml_config[name], bool):
continue
self.yaml_g.add_arg(name,
type(self.yaml_config[name]),
self.yaml_config[name],
"This is from %s" % file_path)
def build(self):
self.args = self.parser.parse_args()
self.arg_config = vars(self.args)
def __add__(self, new_arg):
assert isinstance(new_arg, list) or isinstance(new_arg, tuple)
assert len(new_arg) >= 3
assert self.args is None
name = new_arg[0]
dtype = new_arg[1]
dvalue = new_arg[2]
desc = new_arg[3] if len(
new_arg) == 4 else "Description is not provided."
self.com_g.add_arg(name, dtype, dvalue, desc)
return self
def __getattr__(self, name):
if name in self.arg_config:
return self.arg_config[name]
if name in self.json_config:
return self.json_config[name]
if name in self.yaml_config:
return self.yaml_config[name]
raise Warning("The argument %s is not defined." % name)
def Print(self):
print("-" * 70)
for name in self.arg_config:
print("%s:\t\t\t\t%s" % (str(name), str(self.arg_config[name])))
for name in self.json_config:
if name not in self.arg_config:
print("%s:\t\t\t\t%s" %
(str(name), str(self.json_config[name])))
for name in self.yaml_config:
if name not in self.arg_config:
print("%s:\t\t\t\t%s" %
(str(name), str(self.yaml_config[name])))
print("-" * 70)
if __name__ == "__main__":
"""
pd_config = Config(json_file="./test/bert_config.json")
pd_config.build()
print(pd_config.do_train)
print(pd_config.hidden_size)
pd_config = Config(yaml_file="./test/bert_config.yaml")
pd_config.build()
print(pd_config.do_train)
print(pd_config.hidden_size)
"""
config = Config(yaml_file="./bert.yaml")
config += ("my_age", int, 18, "I am forever 18.")
config.build()
print(config.data_dir)
print(config.my_age)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import folder
from . import mnist
from . import flowers
from .folder import *
from .mnist import *
from .flowers import *
__all__ = folder.__all__ \
+ mnist.__all__ \
+ flowers.__all__
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import io
import tarfile
import numpy as np
import scipy.io as scio
from PIL import Image
from paddle.io import Dataset
from .utils import _check_exists_and_download
__all__ = ["Flowers"]
DATA_URL = 'http://paddlemodels.bj.bcebos.com/flowers/102flowers.tgz'
LABEL_URL = 'http://paddlemodels.bj.bcebos.com/flowers/imagelabels.mat'
SETID_URL = 'http://paddlemodels.bj.bcebos.com/flowers/setid.mat'
DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
# In the official 'readme', tstid flags the test data and trnid flags the
# train data, but the test split is larger than the train split,
# so we swap the train and test data.
MODE_FLAG_MAP = {'train': 'tstid', 'test': 'trnid', 'valid': "valid"}
class Flowers(Dataset):
"""
Implementation of the Flowers dataset
Args:
data_file(str): path to data file, can be set None if
:attr:`download` is True. Default None
label_file(str): path to label file, can be set None if
:attr:`download` is True. Default None
setid_file(str): path to subset index file, can be set
None if :attr:`download` is True. Default None
mode(str): 'train', 'valid' or 'test' mode. Default 'train'.
transform(callable): transform to perform on the sample, None for no
transform. Default None.
download(bool): whether to automatically download the dataset if
:attr:`data_file`/:attr:`label_file`/:attr:`setid_file` is unset.
Default True.
Examples:
.. code-block:: python
from hapi.vision.datasets import Flowers
flowers = Flowers(mode='test')
for i in range(len(flowers)):
sample = flowers[i]
print(sample[0].shape, sample[1])
"""
def __init__(self,
data_file=None,
label_file=None,
setid_file=None,
mode='train',
transform=None,
download=True):
assert mode.lower() in ['train', 'valid', 'test'], \
"mode should be 'train', 'valid' or 'test', but got {}".format(mode)
self.flag = MODE_FLAG_MAP[mode.lower()]
self.data_file = data_file
if self.data_file is None:
assert download, "data_file not set and auto download disabled"
self.data_file = _check_exists_and_download(
data_file, DATA_URL, DATA_MD5, 'flowers', download)
self.label_file = label_file
if self.label_file is None:
assert download, "label_file not set and auto download disabled"
self.label_file = _check_exists_and_download(
label_file, LABEL_URL, LABEL_MD5, 'flowers', download)
self.setid_file = setid_file
if self.setid_file is None:
assert download, "setid_file not set and auto download disabled"
self.setid_file = _check_exists_and_download(
setid_file, SETID_URL, SETID_MD5, 'flowers', download)
self.transform = transform
# read dataset into memory
self._load_anno()
def _load_anno(self):
self.name2mem = {}
self.data_tar = tarfile.open(self.data_file)
for ele in self.data_tar.getmembers():
self.name2mem[ele.name] = ele
self.labels = scio.loadmat(self.label_file)['labels'][0]
self.indexes = scio.loadmat(self.setid_file)[self.flag][0]
def __getitem__(self, idx):
index = self.indexes[idx]
label = np.array([self.labels[index - 1]])
img_name = "jpg/image_%05d.jpg" % index
img_ele = self.name2mem[img_name]
image = self.data_tar.extractfile(img_ele).read()
image = np.array(Image.open(io.BytesIO(image)))
if self.transform is not None:
image, label = self.transform(image, label)
return image, label
def __len__(self):
return len(self.indexes)
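# Sketch: a `transform` passed to Flowers receives and returns (image, label),
# as __getitem__ above shows; the HWC->CHW float conversion is illustrative.
def to_chw_float(image, label):
    image = image.astype('float32').transpose((2, 0, 1)) / 255.0
    return image, label

flowers_test = Flowers(mode='test', transform=to_chw_float)
image, label = flowers_test[0]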
@@ -16,7 +16,9 @@ import os
import sys
import cv2
from paddle.io import Dataset
__all__ = ["DatasetFolder", "ImageFolder"]
def has_valid_extension(filename, extensions):
@@ -71,14 +73,12 @@ class DatasetFolder(Dataset):
Args:
root (string): Root directory path.
loader (callable|optional): A function to load a sample given its path.
extensions (tuple[str]|optional): A list of allowed extensions.
both extensions and is_valid_file should not be passed.
transform (callable|optional): A function/transform that takes in
a sample and returns a transformed version.
is_valid_file (callable|optional): A function that takes the path of a file
and checks if it is a valid file (used to filter out corrupt files);
both extensions and is_valid_file should not be passed.
@@ -94,9 +94,9 @@ class DatasetFolder(Dataset):
loader=None,
extensions=None,
transform=None,
is_valid_file=None):
self.root = root
self.transform = transform
if extensions is None:
extensions = IMG_EXTENSIONS
classes, class_to_idx = self._find_classes(self.root)
@@ -150,9 +150,7 @@ class DatasetFolder(Dataset):
path, target = self.samples[index]
sample = self.loader(path)
if self.transform is not None:
sample, target = self.transform(sample, target)
return sample, target
@@ -166,3 +164,80 @@ IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif',
def cv2_loader(path):
return cv2.imread(path)
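# Sketch: after this change DatasetFolder's `transform` takes and returns the
# (sample, target) pair (target_transform is removed); the directory path is
# illustrative.
def keep_pair(sample, target):
    return sample, target

folder_set = DatasetFolder('./images', transform=keep_pair)
sample, target = folder_set[0]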
class ImageFolder(Dataset):
"""A generic data loader where the samples are arranged in this way:
root/1.ext
root/2.ext
root/sub_dir/3.ext
Args:
root (string): Root directory path.
loader (callable, optional): A function to load a sample given its path.
extensions (tuple[string], optional): A list of allowed extensions.
both extensions and is_valid_file should not be passed.
transform (callable, optional): A function/transform that takes in
a sample and returns a transformed version.
is_valid_file (callable, optional): A function that takes the path of a file
and checks if it is a valid file (used to filter out corrupt files);
both extensions and is_valid_file should not be passed.
Attributes:
samples (list): List of sample paths
"""
def __init__(self,
root,
loader=None,
extensions=None,
transform=None,
is_valid_file=None):
self.root = root
if extensions is None:
extensions = IMG_EXTENSIONS
samples = []
path = os.path.expanduser(root)
if not ((extensions is None) ^ (is_valid_file is None)):
raise ValueError(
"Both extensions and is_valid_file cannot be None or not None at the same time"
)
if extensions is not None:
def is_valid_file(x):
return has_valid_extension(x, extensions)
for root, _, fnames in sorted(os.walk(path, followlinks=True)):
for fname in sorted(fnames):
f = os.path.join(root, fname)
if is_valid_file(f):
samples.append(f)
if len(samples) == 0:
raise (RuntimeError(
"Found 0 files in subfolders of: " + self.root + "\n"
"Supported extensions are: " + ",".join(extensions)))
self.loader = cv2_loader if loader is None else loader
self.extensions = extensions
self.samples = samples
self.transform = transform
def __getitem__(self, index):
"""
Args:
index (int): Index
Returns:
tuple: (sample, target) where target is class_index of the target class.
"""
path = self.samples[index]
sample = self.loader(path)
if self.transform is not None:
sample = self.transform(sample)
return [sample]
def __len__(self):
return len(self.samples)
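# Usage sketch for ImageFolder: it is unlabeled, so __getitem__ returns a
# one-element list and `transform` receives the sample alone; the path is
# illustrative.
image_folder = ImageFolder('./test_images')
for i in range(len(image_folder)):
    [img] = image_folder[i]
    print(img.shape)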
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import gzip
import struct
import numpy as np
import paddle.dataset.common
from paddle.io import Dataset
from .utils import _check_exists_and_download
__all__ = ["MNIST"]
URL_PREFIX = 'https://dataset.bj.bcebos.com/mnist/'
TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3'
TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz'
TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c'
TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz'
TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873'
TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz'
TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
class MNIST(Dataset):
"""
Implementation of the MNIST dataset
Args:
image_path(str): path to image file, can be set None if
:attr:`download` is True. Default None
label_path(str): path to label file, can be set None if
:attr:`download` is True. Default None
mode(str): 'train' or 'test' mode. Default 'train'.
transform(callable): transform to perform on the sample, None for no
transform. Default None.
download(bool): whether to automatically download the dataset if
:attr:`image_path`/:attr:`label_path` is unset. Default True.
Returns:
Dataset: MNIST Dataset.
Examples:
.. code-block:: python
from hapi.vision.datasets import MNIST
mnist = MNIST(mode='test')
for i in range(len(mnist)):
sample = mnist[i]
print(sample[0].shape, sample[1])
"""
def __init__(self,
image_path=None,
label_path=None,
mode='train',
transform=None,
download=True):
assert mode.lower() in ['train', 'test'], \
"mode should be 'train' or 'test', but got {}".format(mode)
self.mode = mode.lower()
self.image_path = image_path
if self.image_path is None:
assert download, "image_path not set and auto download disabled"
image_url = TRAIN_IMAGE_URL if mode == 'train' else TEST_IMAGE_URL
image_md5 = TRAIN_IMAGE_MD5 if mode == 'train' else TEST_IMAGE_MD5
self.image_path = _check_exists_and_download(
image_path, image_url, image_md5, 'mnist', download)
self.label_path = label_path
if self.label_path is None:
assert download, "label_path not set and auto download disabled"
label_url = TRAIN_LABEL_URL if mode == 'train' else TEST_LABEL_URL
label_md5 = TRAIN_LABEL_MD5 if mode == 'train' else TEST_LABEL_MD5
self.label_path = _check_exists_and_download(
label_path, label_url, label_md5, 'mnist', download)
self.transform = transform
# read dataset into memory
self._parse_dataset()
def _parse_dataset(self, buffer_size=100):
self.images = []
self.labels = []
with gzip.GzipFile(self.image_path, 'rb') as image_file:
img_buf = image_file.read()
with gzip.GzipFile(self.label_path, 'rb') as label_file:
lab_buf = label_file.read()
step_label = 0
offset_img = 0
# read from Big-endian
# get file info from magic byte
# image file : 16B
magic_byte_img = '>IIII'
magic_img, image_num, rows, cols = struct.unpack_from(
magic_byte_img, img_buf, offset_img)
offset_img += struct.calcsize(magic_byte_img)
offset_lab = 0
# label file : 8B
magic_byte_lab = '>II'
magic_lab, label_num = struct.unpack_from(magic_byte_lab,
lab_buf, offset_lab)
offset_lab += struct.calcsize(magic_byte_lab)
while True:
if step_label >= label_num:
break
fmt_label = '>' + str(buffer_size) + 'B'
labels = struct.unpack_from(fmt_label, lab_buf, offset_lab)
offset_lab += struct.calcsize(fmt_label)
step_label += buffer_size
fmt_images = '>' + str(buffer_size * rows * cols) + 'B'
images_temp = struct.unpack_from(fmt_images, img_buf,
offset_img)
images = np.reshape(images_temp, (buffer_size, rows *
cols)).astype('float32')
offset_img += struct.calcsize(fmt_images)
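# scale pixel values from [0, 255] to [-1.0, 1.0]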
images = images / 255.0
images = images * 2.0
images = images - 1.0
for i in range(buffer_size):
self.images.append(images[i, :])
self.labels.append(np.array([labels[i]]))
def __getitem__(self, idx):
image, label = self.images[idx], self.labels[idx]
if self.transform is not None:
image, label = self.transform(image, label)
return image, label
def __len__(self):
return len(self.labels)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import paddle.dataset.common
def _check_exists_and_download(path, url, md5, module_name, download=True):
if path and os.path.exists(path):
return path
if download:
return paddle.dataset.common.download(url, module_name, md5)
else:
raise FileNotFoundError(
'{} does not exist and auto download is disabled'.format(path))
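# Usage sketch: resolve a dataset file, downloading it into the paddle cache
# when the local path is unset; the URL/MD5 pair repeats the MNIST test-image
# values defined earlier in this dump.
mnist_images = _check_exists_and_download(
    None, 'https://dataset.bj.bcebos.com/mnist/t10k-images-idx3-ubyte.gz',
    '9fb629c4189551a2d022fa330f9573f3', 'mnist', True)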
@@ -23,7 +23,7 @@ import numpy as np
from paddle import fluid
from paddle.fluid.layers import collective
from paddle.fluid.dygraph.parallel import ParallelEnv, ParallelStrategy
from paddle.io import BatchSampler
_parallel_context_initialized = False
@@ -39,7 +39,7 @@ class DistributedBatchSampler(BatchSampler):
Dataset is assumed to be of constant size.
Args:
data_source: this could be a `paddle.io.Dataset` implementation
or another python object which implements
`__len__` for BatchSampler to get the sample
number of the data source.
...
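# Usage sketch, assuming the constructor signature matches how Model.fit
# constructs it (dataset, batch_size, shuffle, drop_last):
from hapi.distributed import DistributedBatchSampler
from hapi.vision.datasets import MNIST

sampler = DistributedBatchSampler(MNIST(mode='train'), batch_size=64, shuffle=True)
for batch_indices in sampler:
    pass  # each trainer (rank) only sees its own shard of the batches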
@@ -29,13 +29,22 @@ from paddle.fluid.dygraph.parallel import ParallelEnv
import logging
logger = logging.getLogger(__name__)
__all__ = ['get_weights_path', 'is_url']
WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/hapi/weights")
DOWNLOAD_RETRY_LIMIT = 3
def is_url(path):
"""
Whether the path is a URL.
Args:
path (string): the path to check.
"""
return path.startswith('http://') or path.startswith('https://')
def get_weights_path(url, md5sum=None):
"""Get weights path from WEIGHTS_HOME; if it does not exist,
download it from url.
@@ -62,6 +71,7 @@ def get_path(url, root_dir, md5sum=None, check_exist=True):
WEIGHTS_HOME or DATASET_HOME
md5sum (str): md5 sum of download package
"""
assert is_url(url), "downloading from {} not a url".format(url)
# parse path after download to decompress under root_dir
fullpath = map_path(url, root_dir)
...
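# Usage sketch: is_url guards get_weights_path, which caches downloads under
# WEIGHTS_HOME; the URL is illustrative.
weights_url = 'https://paddle-hapi.bj.bcebos.com/models/resnet50.pdparams'
if is_url(weights_url):
    local_weights = get_weights_path(weights_url)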
@@ -48,9 +48,16 @@ class Metric(object):
format(self.__class__.__name__))
@abc.abstractmethod
def update(self, *args):
"""
Update states for metric
Inputs of :code:`update` are the outputs of :code:`Metric.add_metric_op`;
if :code:`add_metric_op` is not defined, the inputs of :code:`update`
will be the flattened arguments of the **outputs** of the model and the
**labels** from the data:
:code:`update(output1, output2, ..., label1, label2,...)`
see :code:`Metric.add_metric_op`
"""
raise NotImplementedError("function 'update' not implemented in {}.".
format(self.__class__.__name__))
@@ -72,11 +79,26 @@ class Metric(object):
raise NotImplementedError("function 'name' not implemented in {}.".
format(self.__class__.__name__))
def add_metric_op(self, *args):
"""
This API is an advanced usage to accelerate metric calculation; calculations
from the outputs of the model to the states which should be updated by the
Metric can be defined here, where Paddle OPs are also supported. Outputs of
this API will be the inputs of "Metric.update".
If :code:`add_metric_op` is defined, it will be called with the **outputs**
of the model and the **labels** from the data as arguments; all outputs and
labels will be concatenated and flattened, and each field passed as a
separate argument as follows:
:code:`add_metric_op(output1, output2, ..., label1, label2,...)`
If :code:`add_metric_op` is not defined, the default behaviour is to pass
the input through to the output, so the output format will be:
:code:`return output1, output2, ..., label1, label2,...`
see :code:`Metric.update`
"""
return args
class Accuracy(Metric):
@@ -91,12 +113,12 @@ class Accuracy(Metric):
self._init_name(name)
self.reset()
def add_metric_op(self, pred, label, *args):
pred = fluid.layers.argsort(pred, descending=True)[1][:, :self.maxk]
correct = pred == label
return correct
def update(self, correct, *args):
accs = []
for i, k in enumerate(self.topk):
num_corrects = correct[:, :k].sum()
...
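# Sketch of a custom Metric following the add_metric_op/update contract
# documented above; the mean-absolute-error logic and names are illustrative.
import numpy as np
from hapi.metrics import Metric

class MeanAbsError(Metric):
    def __init__(self, name='mae'):
        super(MeanAbsError, self).__init__()
        self._name = name
        self.reset()

    def add_metric_op(self, pred, label, *args):
        # runs with Paddle ops; its output becomes the input of update()
        return pred - label

    def update(self, diff, *args):
        self.total += float(np.abs(diff).sum())
        self.count += diff.shape[0]
        return self.total / self.count

    def reset(self):
        self.total = 0.
        self.count = 0

    def accumulate(self):
        return self.total / max(self.count, 1)

    def name(self):
        return self._name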
@@ -32,13 +32,16 @@ from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.fluid.layers.utils import flatten
from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
from paddle.fluid.incubate.fleet.base import role_maker
from paddle.io import DataLoader, Dataset
from hapi.distributed import DistributedBatchSampler, _all_gather, prepare_distributed_context, _parallel_context_initialized
from hapi.metrics import Metric
from hapi.callbacks import config_callbacks
__all__ = [
'Model', 'Loss', 'CrossEntropy', 'Input', 'set_device',
'SoftmaxWithCrossEntropy'
]
def set_device(device):
@@ -64,7 +67,7 @@ def to_list(value):
if value is None:
return value
if isinstance(value, (list, tuple)):
return list(value)
return [value]
@@ -144,6 +147,17 @@ class CrossEntropy(Loss):
]
class SoftmaxWithCrossEntropy(Loss):
def __init__(self, average=True):
super(SoftmaxWithCrossEntropy, self).__init__(average)
def forward(self, outputs, labels):
return [
fluid.layers.softmax_with_cross_entropy(
o, l, return_softmax=False) for o, l in zip(outputs, labels)
]
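# Note: unlike CrossEntropy, SoftmaxWithCrossEntropy consumes raw logits and
# fuses the softmax into the loss, so the network head should omit its softmax.
# Sketch, assuming the usual prepare(optimizer, loss, metrics, inputs, labels):
# model.prepare(optimizer, SoftmaxWithCrossEntropy(), Accuracy(), inputs, labels)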
class StaticGraphAdapter(object):
def __init__(self, model):
super(StaticGraphAdapter, self).__init__()
@@ -179,17 +193,17 @@ class StaticGraphAdapter(object):
def mode(self, value):
self.model.mode = value
def train_batch(self, inputs, labels=None):
assert self.model._optimizer, \
"model not ready, please call `model.prepare()` first"
self.mode = 'train'
return self._run(inputs, labels)
def eval_batch(self, inputs, labels=None):
self.mode = 'eval'
return self._run(inputs, labels)
def test_batch(self, inputs):
self.mode = 'test'
return self._run(inputs, None)
@@ -360,10 +374,27 @@ class StaticGraphAdapter(object):
metric_list, metric_splits = flatten_list(endpoints['metric'])
fetch_list = endpoints['loss'] + metric_list
num_loss = len(endpoints['loss'])
# if a fetch Variable is the same as an input Variable, do not fetch
# it from the program, get it from the input directly
pruned_fetch_list = []
pruned_fetch_idx_name_map = [""] * len(fetch_list)
for i, fetch_var in enumerate(fetch_list):
if fetch_var.name in feed.keys():
pruned_fetch_idx_name_map[i] = fetch_var.name
else:
pruned_fetch_list.append(fetch_var)
rets = self._executor.run(compiled_prog,
feed=feed,
fetch_list=pruned_fetch_list,
return_numpy=False)
# restore pruned fetch_list Variables from the feeds
for i, name in enumerate(pruned_fetch_idx_name_map):
if len(name) > 0:
rets.insert(i, feed[name])
# LoDTensor cannot be fetched as numpy directly
rets = [np.array(v) for v in rets]
if self.mode == 'test':
@@ -442,7 +473,7 @@ class StaticGraphAdapter(object):
if mode != 'test':
for metric in self.model._metrics:
metrics.append(
to_list(metric.add_metric_op(*(outputs + labels))))
if mode == 'train' and self.model._optimizer:
self._loss_endpoint = fluid.layers.sum(losses)
@@ -536,7 +567,7 @@ class DynamicGraphAdapter(object):
self.model.mode = value
# TODO multi device in dygraph mode not implemented at present time
def train_batch(self, inputs, labels=None):
assert self.model._optimizer, \
"model not ready, please call `model.prepare()` first"
super(Model, self.model).train()
@@ -562,14 +593,14 @@ class DynamicGraphAdapter(object):
metrics = []
for metric in self.model._metrics:
metric_outs = metric.add_metric_op(
*(to_list(outputs) + to_list(labels)))
m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
metrics.append(m)
return ([to_numpy(l) for l in losses], metrics) \
if len(metrics) > 0 else [to_numpy(l) for l in losses]
def eval_batch(self, inputs, labels=None):
super(Model, self.model).eval()
self.mode = 'eval'
inputs = to_list(inputs)
@@ -601,7 +632,8 @@ class DynamicGraphAdapter(object):
self._merge_count[self.mode + '_total'] += samples
self._merge_count[self.mode + '_batch'] = samples
metric_outs = metric.add_metric_op(
*(to_list(outputs) + to_list(labels)))
m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
metrics.append(m)
@@ -610,7 +642,7 @@ class DynamicGraphAdapter(object):
return ([to_numpy(l) for l in losses], metrics) \
if len(metrics) > 0 else [to_numpy(l) for l in losses]
def test_batch(self, inputs):
super(Model, self.model).eval()
self.mode = 'test'
inputs = [to_variable(x) for x in to_list(inputs)]
@@ -709,14 +741,14 @@ class Model(fluid.dygraph.Layer):
else:
self._adapter = StaticGraphAdapter(self)
def train_batch(self, *args, **kwargs):
return self._adapter.train_batch(*args, **kwargs)
def eval_batch(self, *args, **kwargs):
return self._adapter.eval_batch(*args, **kwargs)
def test_batch(self, *args, **kwargs):
return self._adapter.test_batch(*args, **kwargs)
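# Sketch of the renamed batch-level API on a prepared `model` (see the
# fit/evaluate/predict sketch after this file); shapes and dtypes are
# illustrative.
import numpy as np

img_batch = np.random.random((4, 1, 28, 28)).astype('float32')
lbl_batch = np.random.randint(0, 10, (4, 1)).astype('int64')

train_out = model.train_batch([img_batch], [lbl_batch])  # one optimizer step
eval_out = model.eval_batch([img_batch], [lbl_batch])
test_out = model.test_batch([img_batch])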
def save(self, *args, **kwargs):
if ParallelEnv().local_rank == 0:
@@ -767,6 +799,13 @@ class Model(fluid.dygraph.Layer):
format(key, list(state.shape), list(param.shape)))
return param, state
def _strip_postfix(path):
path, ext = os.path.splitext(path)
assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \
"Unknown postfix {} from weights".format(ext)
return path
path = _strip_postfix(path)
param_state = _load_state_from_path(path + ".pdparams")
assert param_state, "Failed to load parameters, please check path."
@@ -777,7 +816,7 @@ class Model(fluid.dygraph.Layer):
except ValueError as err:
if skip_mismatch:
warnings.warn(
("Skip loading for {}. ".format(key) + str(err)))
# reset optimizer when mismatch happens
reset_optimizer = True
else:
@@ -896,36 +935,36 @@ class Model(fluid.dygraph.Layer):
FIXME: add more comments and usage
Args:
train_data (Dataset|DataLoader): An iterable data loader is used for
train. An instance of paddle.io.Dataset or
paddle.io.DataLoader is recommended. Default: None.
eval_data (Dataset|DataLoader): An iterable data loader is used for
evaluation at the end of epoch. If None, will not do evaluation.
An instance of paddle.io.Dataset or paddle.io.DataLoader
is recommended. Default: None.
batch_size (int): Integer number. The batch size of train_data and eval_data.
When train_data and eval_data are both the instance of Dataloader, this
parameter will be ignored. Default: 1.
epochs (int): Integer number. The number of epochs to train the model. Default: 1.
eval_freq (int): The frequency, in number of epochs, an evaluation
is performed. Default: 1.
log_freq (int): The frequency, in number of steps, the training logs
are printed. Default: 10.
save_dir(str|None): The directory to save checkpoint during training.
If None, will not save checkpoint. Default: None.
save_freq (int): The frequency, in number of epochs, to save checkpoint. Default: 1.
verbose (int): The verbosity mode, should be 0, 1, or 2.
0 = silent, 1 = progress bar, 2 = one line per epoch. Default: 2.
drop_last (bool): whether to drop the last incomplete batch of train_data
when the dataset size is not divisible by the batch size. When train_data
is an instance of Dataloader, this parameter will be ignored. Default: False.
shuffle (bool): whether to shuffle train_data. When train_data is an instance
of Dataloader, this parameter will be ignored. Default: True.
num_workers (int): the number of subprocesses to load data, 0 for no subprocess
used and loading data in the main process. When train_data and eval_data are
both the instance of Dataloader, this parameter will be ignored. Default: 0.
callbacks (Callback|None): A list of `Callback` instances to apply
during training. If None, `ProgBarLogger` and `ModelCheckpoint`
are automatically inserted. Default: None.
"""
assert train_data is not None, \
@@ -1024,21 +1063,23 @@ class Model(fluid.dygraph.Layer):
FIXME: add more comments and usage
Args:
eval_data (Dataset|DataLoader): An iterable data loader is used for
evaluation. An instance of paddle.io.Dataset or
paddle.io.DataLoader is recommended.
batch_size (int): Integer number. The batch size of train_data and eval_data.
When eval_data is the instance of Dataloader, this argument will be ignored.
Default: 1.
log_freq (int): The frequency, in number of steps, the eval logs
are printed. Default: 10.
verbose (int): The verbosity mode, should be 0, 1, or 2.
0 = silent, 1 = progress bar, 2 = one line per epoch. Default: 2.
num_workers (int): The number of subprocesses to load data, 0 for no subprocess
used and loading data in the main process. When train_data and eval_data are
both the instance of Dataloader, this parameter will be ignored. Default: 0.
callbacks (Callback|None): A list of `Callback` instances to apply
during training. If None, `ProgBarLogger` and `ModelCheckpoint`
are automatically inserted. Default: None.
Returns:
dict: Result of the metrics.
"""
if fluid.in_dygraph_mode():
@@ -1099,26 +1140,28 @@ class Model(fluid.dygraph.Layer):
FIXME: add more comments and usage
Args:
test_data (Dataset|DataLoader): An iterable data loader is used for
predict. An instance of paddle.io.Dataset or paddle.io.DataLoader
is recommended.
batch_size (int): Integer number. The batch size of train_data and eval_data.
When train_data and eval_data are both the instance of Dataloader, this
argument will be ignored. Default: 1.
num_workers (int): the number of subprocesses to load data, 0 for no subprocess
used and loading data in the main process. When train_data and eval_data are
both the instance of Dataloader, this argument will be ignored. Default: 0.
stack_outputs (bool): whether to stack output fields like a batch. If an output
field of a sample has shape [X, Y] and test_data contains N samples, the
predict output field will have shape [N, X, Y] if stack_outputs is True,
and will be a length-N list of shape [[X, Y], [X, Y], ..., [X, Y]] if
stack_outputs is False. stack_outputs as False is used for LoDTensor output
situations; it is recommended to set it as True if the outputs contain no
LoDTensor. Default: False.
Returns:
list: Outputs of the model.
"""
if fluid.in_dygraph_mode():
feed_list = None
else:
feed_list = [x.forward() for x in self._inputs]
if test_data is not None and isinstance(test_data, Dataset):
test_sampler = DistributedBatchSampler(
@@ -1142,7 +1185,7 @@ class Model(fluid.dygraph.Layer):
outputs = []
for data in tqdm.tqdm(loader):
data = flatten(data)
outputs.append(self.test_batch(data[:len(self._inputs)]))
# NOTE: for LoDTensor output, we should not stack outputs
# since stacking may lose the detail info
@@ -1156,18 +1199,6 @@ class Model(fluid.dygraph.Layer):
outputs = [o[:len(test_loader.dataset)] for o in outputs]
return outputs
def _run_one_epoch(self,
data_loader,
callbacks,
@@ -1204,11 +1235,11 @@ class Model(fluid.dygraph.Layer):
callbacks.on_batch_begin(mode, step, logs)
if mode == 'train':
outs = self.train_batch(data[:len(self._inputs)],
data[len(self._inputs):])
else:
outs = self.eval_batch(data[:len(self._inputs)],
data[len(self._inputs):])
# losses
loss = outs[0] if self._metrics else outs
@@ -1236,7 +1267,7 @@ class Model(fluid.dygraph.Layer):
if mode == 'train':
assert epoch is not None, 'when mode is train, epoch must be given'
callbacks.on_epoch_end(epoch, logs)
return logs
...
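# End-to-end sketch of the fit/evaluate/predict API documented above; LeNet is
# assumed to be available from hapi.vision.models, and the hyperparameters are
# illustrative.
import paddle.fluid as fluid
from hapi.model import set_device, Input, CrossEntropy
from hapi.metrics import Accuracy
from hapi.vision.datasets import MNIST
from hapi.vision.models import LeNet  # assumption: shipped with hapi.vision

device = set_device('cpu')
fluid.enable_dygraph(device)

def reshape_sample(img, label):
    # the MNIST dataset above yields flattened 784-float images; restore CHW
    return img.reshape(1, 28, 28), label

train_data = MNIST(mode='train', transform=reshape_sample)
eval_data = MNIST(mode='test', transform=reshape_sample)

model = LeNet()
inputs = [Input([None, 1, 28, 28], 'float32', name='image')]
labels = [Input([None, 1], 'int64', name='label')]
optim = fluid.optimizer.Adam(
    learning_rate=1e-3, parameter_list=model.parameters())
model.prepare(optim, CrossEntropy(), Accuracy(), inputs, labels)

model.fit(train_data, eval_data, batch_size=64, epochs=1)
eval_result = model.evaluate(eval_data, batch_size=64)  # dict of metrics
predictions = model.predict(eval_data, batch_size=64)   # list of outputs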
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from hapi.text.text import RNNCell as RNNCell
from hapi.text.text import BasicLSTMCell as BasicLSTMCell
from hapi.text.text import BasicGRUCell as BasicGRUCell
from hapi.text.text import RNN as RNN
from hapi.text.text import DynamicDecode as DynamicDecode
from hapi.text.text import BeamSearchDecoder as BeamSearchDecoder
from hapi.text.text import MultiHeadAttention as MultiHeadAttention
from hapi.text.text import FFN as FFN
from hapi.text.text import TransformerEncoderLayer as TransformerEncoderLayer
from hapi.text.text import TransformerDecoderLayer as TransformerDecoderLayer
from hapi.text.text import TransformerEncoder as TransformerEncoder
from hapi.text.text import TransformerDecoder as TransformerDecoder
from hapi.text.text import TransformerBeamSearchDecoder as TransformerBeamSearchDecoder
from hapi.text.text import GRUCell as GRUCell
from hapi.text.text import GRUEncoderCell as GRUEncoderCell
from hapi.text.text import BiGRU as BiGRU
from hapi.text.text import Linear_chain_crf as Linear_chain_crf
from hapi.text.text import Crf_decoding as Crf_decoding
from hapi.text.text import SequenceTagging as SequenceTagging
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from hapi.text.bert.bert import BertConfig as BertConfig
from hapi.text.bert.optimization import Optimizer as Optimizer
from hapi.text.bert.dataloader import BertDataLoader as BertDataLoader
from hapi.text.bert.dataloader import BertInputExample as BertInputExample
from hapi.text.tokenizer import tokenization as tokenization
from hapi.text.bert.bert import BertEncoder as BertEncoder
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask, padding and batching."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
"""
Add mask for batch_tokens, return out, mask_label, mask_pos;
Note: mask_pos responding the batch_tokens after padded;
"""
max_len = max([len(sent) for sent in batch_tokens])
mask_label = []
mask_pos = []
prob_mask = np.random.rand(total_token_num)
# Note: the first token is [CLS], so [low=1]
replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
pre_sent_len = 0
prob_index = 0
for sent_index, sent in enumerate(batch_tokens):
mask_flag = False
prob_index += pre_sent_len
for token_index, token in enumerate(sent):
prob = prob_mask[prob_index + token_index]
if prob > 0.15:
continue
elif 0.03 < prob <= 0.15:
# mask
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
elif 0.015 < prob <= 0.03:
# random replace
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
sent[token_index] = replace_ids[prob_index + token_index]
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
else:
# keep the original token
if token != SEP and token != CLS:
mask_label.append(sent[token_index])
mask_pos.append(sent_index * max_len + token_index)
pre_sent_len = len(sent)
        # ensure that at least one token in each sentence is masked
while not mask_flag:
token_index = int(np.random.randint(1, high=len(sent) - 1, size=1))
if sent[token_index] != SEP and sent[token_index] != CLS:
mask_label.append(sent[token_index])
sent[token_index] = MASK
mask_flag = True
mask_pos.append(sent_index * max_len + token_index)
mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
return batch_tokens, mask_label, mask_pos
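# Illustrative sketch (not part of the library): masking a toy batch.
# Ids 1/2/3 stand for [CLS]/[SEP]/[MASK] (the defaults above) and
# vocab_size=100 is an assumed toy value.
def _demo_mask():
    batch = [[1, 11, 12, 2], [1, 21, 22, 23, 2]]
    out, mask_label, mask_pos = mask(batch, total_token_num=9, vocab_size=100)
    # mask_label/mask_pos are int64 arrays of shape [-1, 1]; mask_pos indexes
    # into the flattened (batch * max_len) layout after padding.
    return out, mask_label, mask_pos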
def prepare_batch_data(insts,
total_token_num,
voc_size=0,
pad_id=None,
cls_id=None,
sep_id=None,
                       mask_id=-1,
return_input_mask=True,
return_max_len=True,
return_num_token=False):
"""
1. generate Tensor of data
2. generate Tensor of position
3. generate self attention mask, [shape: batch_size * max_len * max_len]
"""
batch_src_ids = [inst[0] for inst in insts]
batch_pos_ids = [inst[1] for inst in insts]
batch_sent_ids = [inst[2] for inst in insts]
labels_list = []
# compatible with squad, whose example includes start/end positions,
# or unique id
for i in range(3, len(insts[0]), 1):
labels = [inst[i] for inst in insts]
labels = np.array(labels).astype("int64").reshape([-1, 1])
labels_list.append(labels)
# First step: do mask without padding
if mask_id >= 0:
out, mask_label, mask_pos = mask(
batch_src_ids,
total_token_num,
vocab_size=voc_size,
CLS=cls_id,
SEP=sep_id,
MASK=mask_id)
else:
out = batch_src_ids
# Second step: padding
src_id, self_input_mask = pad_batch_data(
out, pad_idx=pad_id, return_input_mask=True)
pos_id = pad_batch_data(
batch_pos_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
sent_id = pad_batch_data(
batch_sent_ids,
pad_idx=pad_id,
return_pos=False,
return_input_mask=False)
if mask_id >= 0:
return_list = [
src_id, pos_id, sent_id, self_input_mask, mask_label, mask_pos
] + labels_list
else:
return_list = [src_id, pos_id, sent_id, self_input_mask] + labels_list
return return_list if len(return_list) > 1 else return_list[0]
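# Illustrative sketch with toy ids: 0/1/2 are assumed stand-ins for
# [PAD]/[CLS]/[SEP]; mask_id=-1 disables masking, so only padding and the
# attention mask are produced.
def _demo_prepare_batch_data():
    insts = [
        ([1, 11, 12, 2], [0, 1, 2, 3], [0, 0, 0, 0], 1),
        ([1, 21, 2], [0, 1, 2], [0, 0, 0], 0),
    ]
    src_id, pos_id, sent_id, input_mask, labels = prepare_batch_data(
        insts, total_token_num=7, pad_id=0, cls_id=1, sep_id=2, mask_id=-1)
    return src_id.shape  # (2, 4)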
def pad_batch_data(insts,
pad_idx=0,
return_pos=False,
return_input_mask=False,
return_max_len=False,
return_num_token=False):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and input mask.
"""
return_list = []
max_len = max(len(inst) for inst in insts)
# Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and make no effect on parameter gradients.
inst_data = np.array([
list(inst) + list([pad_idx] * (max_len - len(inst))) for inst in insts
])
return_list += [inst_data.astype("int64").reshape([-1, max_len])]
# position data
if return_pos:
inst_pos = np.array([
list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst))
for inst in insts
])
return_list += [inst_pos.astype("int64").reshape([-1, max_len])]
if return_input_mask:
# This is used to avoid attention on paddings.
input_mask_data = np.array([[1] * len(inst) + [0] *
(max_len - len(inst)) for inst in insts])
input_mask_data = np.expand_dims(input_mask_data, axis=-1)
return_list += [input_mask_data.astype("float32")]
if return_max_len:
return_list += [max_len]
if return_num_token:
num_token = 0
for inst in insts:
num_token += len(inst)
return_list += [num_token]
return return_list if len(return_list) > 1 else return_list[0]
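# Illustrative sketch: pad a toy batch and build its attention mask.
def _demo_pad_batch_data():
    insts = [[5, 6, 7], [8, 9]]
    src, input_mask = pad_batch_data(insts, pad_idx=0, return_input_mask=True)
    # src has shape (2, 3); input_mask has shape (2, 3, 1) with zeros at the
    # padded positions.
    return src, input_mask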
if __name__ == "__main__":
pass
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"bert"
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import json
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, to_variable, Layer, guard
from hapi.text.text import PrePostProcessLayer, TransformerEncoder
from hapi.text.bert.utils.init import init_from_static_model
class BertConfig(object):
def __init__(self, config_path):
self._config_dict = self._parse(config_path)
def _parse(self, config_path):
try:
with open(config_path) as json_file:
config_dict = json.load(json_file)
except Exception:
raise IOError("Error in parsing bert model config file '%s'" %
config_path)
else:
return config_dict
def __getitem__(self, key):
return self._config_dict[key]
def print_config(self):
for arg, value in sorted(six.iteritems(self._config_dict)):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
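# Illustrative sketch: the path below is an assumption; point it at a real
# bert_config.json to load and inspect a configuration.
def _demo_bert_config():
    config = BertConfig("./config/bert_config.json")
    config.print_config()
    return config["hidden_size"]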
class BertEncoder(Layer):
"""
bert
"""
def __init__(self, config, return_pooled_out=True, use_fp16=False):
super(BertEncoder, self).__init__()
self.config = config
self._emb_size = config['hidden_size']
self._n_layer = config['num_hidden_layers']
self._n_head = config['num_attention_heads']
self._voc_size = config['vocab_size']
self._max_position_seq_len = config['max_position_embeddings']
self._sent_types = config['type_vocab_size']
self._hidden_act = config['hidden_act']
self._prepostprocess_dropout = config['hidden_dropout_prob']
self._attention_dropout = config['attention_probs_dropout_prob']
self.return_pooled_out = return_pooled_out
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
self._dtype = "float16" if use_fp16 else "float32"
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config['initializer_range'])
self._src_emb = Embedding(
size=[self._voc_size, self._emb_size],
param_attr=fluid.ParamAttr(
name=self._word_emb_name, initializer=self._param_initializer),
dtype=self._dtype)
self._pos_emb = Embedding(
size=[self._max_position_seq_len, self._emb_size],
param_attr=fluid.ParamAttr(
name=self._pos_emb_name, initializer=self._param_initializer),
dtype=self._dtype)
self._sent_emb = Embedding(
size=[self._sent_types, self._emb_size],
param_attr=fluid.ParamAttr(
name=self._sent_emb_name, initializer=self._param_initializer),
dtype=self._dtype)
self.pooled_fc = Linear(
input_dim=self._emb_size,
output_dim=self._emb_size,
param_attr=fluid.ParamAttr(
name="pooled_fc.w_0", initializer=self._param_initializer),
bias_attr="pooled_fc.b_0",
act="tanh")
self.pre_process_layer = PrePostProcessLayer(
"nd", self._emb_size, self._prepostprocess_dropout, None)
self._encoder = TransformerEncoder(
n_layer=self._n_layer,
n_head=self._n_head,
d_key=self._emb_size // self._n_head,
d_value=self._emb_size // self._n_head,
d_model=self._emb_size,
d_inner_hid=self._emb_size * 4,
prepostprocess_dropout=self._prepostprocess_dropout,
attention_dropout=self._attention_dropout,
relu_dropout=0,
preprocess_cmd="",
postprocess_cmd="dan",
ffn_fc1_act=self._hidden_act)
def init_parameters(self, param_path="", verbose=False):
init_from_static_model(param_path, self, self.config, verbose)
def forward(self, src_ids, position_ids, sentence_ids, input_mask):
"""
forward
"""
src_emb = self._src_emb(src_ids)
pos_emb = self._pos_emb(position_ids)
sent_emb = self._sent_emb(sentence_ids)
emb_out = src_emb + pos_emb
emb_out = emb_out + sent_emb
emb_out = self.pre_process_layer(emb_out)
self_attn_mask = fluid.layers.matmul(
x=input_mask, y=input_mask, transpose_y=True)
self_attn_mask = fluid.layers.scale(
x=self_attn_mask, scale=10000.0, bias=-1.0, bias_after_scale=False)
n_head_self_attn_mask = fluid.layers.stack(
x=[self_attn_mask] * self._n_head, axis=1)
n_head_self_attn_mask.stop_gradient = True
enc_output = self._encoder(emb_out, n_head_self_attn_mask)
if not self.return_pooled_out:
return enc_output
next_sent_feat = fluid.layers.slice(
input=enc_output, axes=[1], starts=[0], ends=[1])
next_sent_feat = self.pooled_fc(next_sent_feat)
next_sent_feat = fluid.layers.reshape(
next_sent_feat, shape=[-1, self._emb_size])
return enc_output, next_sent_feat
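# Illustrative sketch (the toy shapes and config path are assumptions): run
# the encoder on random ids under dygraph.
def _demo_bert_encoder():
    fluid.enable_dygraph()
    config = BertConfig("./config/bert_config.json")
    encoder = BertEncoder(config, return_pooled_out=True)
    batch_size, seq_len = 2, 16
    src_ids = to_variable(
        np.random.randint(0, config["vocab_size"],
                          (batch_size, seq_len)).astype("int64"))
    pos_ids = to_variable(
        np.tile(np.arange(seq_len, dtype="int64"), (batch_size, 1)))
    sent_ids = to_variable(np.zeros((batch_size, seq_len), dtype="int64"))
    input_mask = to_variable(
        np.ones((batch_size, seq_len, 1), dtype="float32"))
    enc_output, pooled = encoder(src_ids, pos_ids, sent_ids, input_mask)
    # enc_output: [batch, seq_len, hidden]; pooled: [batch, hidden]
    return enc_output, pooled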
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
import types
import csv
import numpy as np
import hapi.text.tokenizer.tokenization as tokenization
from hapi.text.bert.batching import prepare_batch_data
class DataProcessor(object):
"""Base class for data converters for sequence classification data sets."""
def __init__(self, tokenizer, max_seq_len, in_tokens, random_seed=None):
self.max_seq_len = max_seq_len
self.tokenizer = tokenizer
self.vocab = self.tokenizer.vocab
self.in_tokens = in_tokens
np.random.seed(random_seed)
self.current_train_example = -1
self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
self.current_train_epoch = -1
def get_train_iter(self,
data_dir,
epoch_num=1,
shuffle=True,
shuffle_seed=None):
"""Gets a collection of `InputExample`s for the train set."""
raise NotImplementedError()
def get_dev_iter(self, data_dir):
"""Gets a collection of `InputExample`s for the dev set."""
raise NotImplementedError()
def get_test_iter(self, data_dir):
"""Gets a collection of `InputExample`s for prediction."""
raise NotImplementedError()
def get_labels(self):
"""Gets the list of labels for this data set."""
raise NotImplementedError()
def convert_example(self, index, example, labels, max_seq_len, tokenizer):
"""Converts a single `InputExample` into a single `InputFeatures`."""
feature = convert_single_example(index, example, labels, max_seq_len,
tokenizer)
return feature
    def _read_tsv(self, input_file, quotechar=None):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="utf8") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
lines.append(line)
return lines
def generate_instance(self, feature):
"""
generate instance with given feature
Args:
feature: InputFeatures(object). A single set of features of data.
"""
        input_pos = list(range(len(feature.input_ids)))
        # order must match prepare_batch_data: src_ids, pos_ids, sent_ids, label
        return [
            feature.input_ids, input_pos, feature.segment_ids, feature.label_id
        ]
    def generate_batch_data(self,
                            batch_data,
                            total_token_num,
                            voc_size=-1,
                            mask_id=-1,
                            return_input_mask=True,
                            return_max_len=False,
                            return_num_token=False):
        # forward the arguments instead of silently hard-coding them
        return prepare_batch_data(
            batch_data,
            total_token_num,
            voc_size=voc_size,
            pad_id=self.vocab["[PAD]"],
            cls_id=self.vocab["[CLS]"],
            sep_id=self.vocab["[SEP]"],
            mask_id=mask_id,
            return_input_mask=return_input_mask,
            return_max_len=return_max_len,
            return_num_token=return_num_token)
def get_num_examples(self, phase):
"""Get number of examples for train, dev or test."""
if phase not in ['train', 'dev', 'test']:
raise ValueError(
"Unknown phase, which should be in ['train', 'dev', 'test'].")
if phase == 'train':
return len(self.train_examples)
elif phase == 'dev':
return len(self.dev_examples)
elif phase == 'test':
return len(self.test_examples)
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'dev', 'test'].")
def get_train_progress(self):
"""Gets progress for training phase."""
return self.current_train_example, self.current_train_epoch
def data_generator(self, data_iter, batch_size, phase='train',
dev_count=1):
"""
Generate data for train, dev or test.
Args:
batch_size: int. The batch size of generated data.
phase: string. The phase for which to generate data.
"""
assert phase in ['train', 'dev', 'test']
if phase == 'train':
sample_num = len(self.train_examples)
elif phase == 'dev':
sample_num = len(self.dev_examples)
elif phase == 'test':
sample_num = len(self.test_examples)
else:
sample_num = -1
self.num_examples[phase] = sample_num
def instance_reader():
for epoch_idx, example_idx, example in data_iter():
if phase == 'train':
self.current_train_epoch = epoch_idx
self.current_train_example = example_idx
feature = self.convert_example(
example_idx, example,
self.get_labels(), self.max_seq_len, self.tokenizer)
instance = self.generate_instance(feature)
yield instance
def batch_reader(reader, batch_size, in_tokens):
batch, total_token_num, max_len = [], 0, 0
for instance in reader():
                token_ids, pos_ids, sent_ids, label = instance[:4]
max_len = max(max_len, len(token_ids))
if in_tokens:
to_append = (len(batch) + 1) * max_len <= batch_size
else:
to_append = len(batch) < batch_size
if to_append:
batch.append(instance)
total_token_num += len(token_ids)
else:
yield batch, total_token_num
batch, total_token_num, max_len = [instance], len(
token_ids), len(token_ids)
if len(batch) > 0:
yield batch, total_token_num
def wrapper():
all_dev_batches = []
for batch_data, total_token_num in batch_reader(
instance_reader, batch_size, self.in_tokens):
batch_data = self.generate_batch_data(
batch_data,
total_token_num,
voc_size=-1,
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False)
if len(all_dev_batches) < dev_count:
all_dev_batches.append(batch_data)
if len(all_dev_batches) == dev_count:
for batch in all_dev_batches:
yield batch
all_dev_batches = []
return wrapper
class InputExample(object):
"""A single training/test example for simple sequence classification."""
def __init__(self, guid, text_a, text_b=None, label=None):
"""Constructs a InputExample.
Args:
guid: Unique id for the example.
text_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
text_b: (Optional) string. The untokenized text of the second sequence.
Only must be specified for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
"""
self.guid = guid
self.text_a = text_a
self.text_b = text_b
self.label = label
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, input_ids, input_mask, segment_ids, label_id):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.label_id = label_id
class XnliProcessor(DataProcessor):
"""Processor for the XNLI data set."""
def get_train_iter(self,
data_dir,
epoch_num=1,
shuffle=True,
shuffle_seed=None):
"""See base class."""
self.language = "zh"
lines = self._read_tsv(
os.path.join(data_dir, "multinli", "multinli.train.%s.tsv" %
self.language))
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "train-%d" % (i)
text_a = tokenization.convert_to_unicode(line[0])
text_b = tokenization.convert_to_unicode(line[1])
label = tokenization.convert_to_unicode(line[2])
if label == tokenization.convert_to_unicode("contradictory"):
label = tokenization.convert_to_unicode("contradiction")
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
self.train_examples = examples
def wrapper():
if shuffle:
if shuffle_seed is not None:
np.random.seed(shuffle_seed)
for epoch_idx in range(epoch_num):
if shuffle:
np.random.shuffle(examples)
for (example_idx, example) in enumerate(examples):
yield epoch_idx, example_idx, example
return wrapper
def get_dev_iter(self, data_dir):
"""See base class."""
self.language = "zh"
lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "dev-%d" % (i)
language = tokenization.convert_to_unicode(line[0])
if language != tokenization.convert_to_unicode(self.language):
continue
text_a = tokenization.convert_to_unicode(line[6])
text_b = tokenization.convert_to_unicode(line[7])
label = tokenization.convert_to_unicode(line[1])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
self.dev_examples = examples
def wrapper():
for (example_idx, example) in enumerate(examples):
yield 0, example_idx, example
return wrapper
def get_test_iter(self, data_dir):
"""See base class."""
self.language = "zh"
lines = self._read_tsv(os.path.join(data_dir, "xnli.test.tsv"))
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "test-%d" % (i)
language = tokenization.convert_to_unicode(line[0])
if language != tokenization.convert_to_unicode(self.language):
continue
text_a = tokenization.convert_to_unicode(line[6])
text_b = tokenization.convert_to_unicode(line[7])
label = tokenization.convert_to_unicode(line[1])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
self.test_examples = examples
def wrapper():
for (example_idx, example) in enumerate(examples):
yield 0, example_idx, example
return wrapper
def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]
class MnliProcessor(DataProcessor):
"""Processor for the MultiNLI data set (GLUE version)."""
def get_train_iter(self,
data_dir,
epoch_num=1,
shuffle=True,
shuffle_seed=None):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
self.train_examples = examples
def wrapper():
if shuffle:
if shuffle_seed is not None:
np.random.seed(shuffle_seed)
for epoch_idx in range(epoch_num):
if shuffle:
np.random.shuffle(examples)
for (example_idx, example) in enumerate(examples):
yield epoch_idx, example_idx, example
return wrapper
def get_dev_iter(self, data_dir):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
"dev_matched")
self.dev_examples = examples
def wrapper():
for (example_idx, example) in enumerate(examples):
yield 0, example_idx, example
return wrapper
def get_test_iter(self, data_dir):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test")
self.test_examples = examples
def wrapper():
for (example_idx, example) in enumerate(examples):
yield 0, example_idx, example
return wrapper
def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type,
tokenization.convert_to_unicode(line[0]))
text_a = tokenization.convert_to_unicode(line[8])
text_b = tokenization.convert_to_unicode(line[9])
if set_type == "test":
label = "contradiction"
else:
label = tokenization.convert_to_unicode(line[-1])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
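# Illustrative sketch: `tokenizer` is a FullTokenizer and `data_dir` a
# GLUE-style MNLI directory (both assumed to exist outside this module).
def _demo_mnli_processor(tokenizer, data_dir):
    processor = MnliProcessor(tokenizer, max_seq_len=128, in_tokens=False)
    train_iter = processor.get_train_iter(data_dir, epoch_num=1, shuffle=False)
    loader = processor.data_generator(train_iter, batch_size=32, phase='train')
    for src_id, pos_id, sent_id, input_mask, labels in loader():
        # one padded batch, ready to feed BertEncoder
        break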
class MrpcProcessor(DataProcessor):
"""Processor for the MRPC data set (GLUE version)."""
def get_train_iter(self,
data_dir,
epoch_num=1,
shuffle=True,
shuffle_seed=None):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
self.train_examples = examples
def wrapper():
if shuffle:
if shuffle_seed is not None:
np.random.seed(shuffle_seed)
for epoch_idx in range(epoch_num):
if shuffle:
np.random.shuffle(examples)
for (example_idx, example) in enumerate(examples):
yield epoch_idx, example_idx, example
return wrapper
def get_dev_examples(self, data_dir):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
self.dev_examples = examples
def wrapper():
for (example_idx, example) in enumerate(examples):
yield 0, example_idx, example
return wrapper
def get_test_examples(self, data_dir):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
self.test_examples = examples
def wrapper():
for (example_idx, example) in enumerate(examples):
yield 0, example_idx, example
return wrapper
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = tokenization.convert_to_unicode(line[3])
text_b = tokenization.convert_to_unicode(line[4])
if set_type == "test":
label = "0"
else:
label = tokenization.convert_to_unicode(line[0])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
class ColaProcessor(DataProcessor):
"""Processor for the CoLA data set (GLUE version)."""
def get_train_iter(self,
data_dir,
epoch_num=1,
shuffle=True,
shuffle_seed=None):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
self.train_examples = examples
def wrapper():
if shuffle:
if shuffle_seed is not None:
np.random.seed(shuffle_seed)
for epoch_idx in range(epoch_num):
if shuffle:
np.random.shuffle(examples)
for (example_idx, example) in enumerate(examples):
yield epoch_idx, example_idx, example
return wrapper
def get_dev_iter(self, data_dir):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
self.dev_examples = examples
def wrapper():
for (example_idx, example) in enumerate(examples):
yield 0, example_idx, example
return wrapper
def get_test_iter(self, data_dir):
"""See base class."""
examples = self._create_examples(
self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
self.test_examples = examples
def wrapper():
for (example_idx, example) in enumerate(examples):
yield 0, example_idx, example
return wrapper
def get_labels(self):
"""See base class."""
return ["0", "1"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
# Only the test set has a header
if set_type == "test" and i == 0:
continue
guid = "%s-%s" % (set_type, i)
if set_type == "test":
text_a = tokenization.convert_to_unicode(line[1])
label = "0"
else:
text_a = tokenization.convert_to_unicode(line[3])
label = tokenization.convert_to_unicode(line[1])
examples.append(
InputExample(
guid=guid, text_a=text_a, text_b=None, label=label))
return examples
def convert_single_example_to_unicode(guid, single_example):
text_a = tokenization.convert_to_unicode(single_example[0])
text_b = tokenization.convert_to_unicode(single_example[1])
label = tokenization.convert_to_unicode(single_example[2])
return InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
def convert_single_example(ex_index, example, label_list, max_seq_length,
tokenizer):
"""Converts a single `InputExample` into a single `InputFeatures`."""
label_map = {}
for (i, label) in enumerate(label_list):
label_map[label] = i
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
label_id = label_map[example.label]
feature = InputFeatures(
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id)
return feature
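# Illustrative sketch: `tokenizer` is a FullTokenizer built from a real vocab
# file (an assumption here).
def _demo_convert_single_example(tokenizer):
    example = InputExample(
        guid="demo-1", text_a="the dog is hairy .", text_b=None, label="0")
    feature = convert_single_example(0, example, ["0", "1"], 128, tokenizer)
    # feature.input_ids starts with [CLS] and ends with [SEP];
    # feature.segment_ids is all zeros for a single sentence.
    return feature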
def convert_examples_to_features(examples, label_list, max_seq_length,
tokenizer):
"""Convert a set of `InputExample`s to a list of `InputFeatures`."""
features = []
for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
print("Writing example %d of %d" % (ex_index, len(examples)))
feature = convert_single_example(ex_index, example, label_list,
max_seq_length, tokenizer)
features.append(feature)
return features
if __name__ == '__main__':
print("hello world")
pass
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
import six
import csv
import glob
import tarfile
import itertools
import leveldb
from functools import partial
import numpy as np
import paddle.fluid as fluid
from paddle.io import BatchSampler, DataLoader, Dataset
from hapi.distributed import DistributedBatchSampler
from hapi.text.bert.data_processor import DataProcessor, XnliProcessor, ColaProcessor, MrpcProcessor, MnliProcessor
from hapi.text.bert.batching import prepare_batch_data
import hapi.text.tokenizer.tokenization as tokenization
from paddle.fluid.dygraph.parallel import ParallelEnv, ParallelStrategy
__all__ = [
'BertInputExample', 'BertInputFeatures', 'SingleSentenceDataset',
'SentencePairDataset', 'BertDataLoader'
]
class BertInputExample(object):
def __init__(self, uid, text_a, text_b=None, label=None):
self.uid = uid
self.text_a = text_a
self.text_b = text_b
self.label = label
class BertInputFeatures(object):
def __init__(self, input_ids, input_mask, segment_ids, label_id):
self.input_ids = input_ids
self.pos_ids = list(range(len(self.input_ids)))
self.input_mask = input_mask
self.segment_ids = segment_ids
self.label_id = label_id
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def convert_single_example_to_unicode(guid, single_example):
text_a = tokenization.convert_to_unicode(single_example[0])
text_b = tokenization.convert_to_unicode(single_example[1])
label = tokenization.convert_to_unicode(single_example[2])
    return BertInputExample(uid=guid, text_a=text_a, text_b=text_b, label=label)
def convert_single_example(ex_index, example, label_list, max_seq_length,
tokenizer):
"""Converts a single `BertInputExample` into a single `BertInputFeatures`."""
label_map = {}
for (i, label) in enumerate(label_list):
label_map[label] = i
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
if tokens_b:
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_mask = [1] * len(input_ids)
label_id = label_map[example.label]
feature = BertInputFeatures(
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id)
return feature
def convert_examples_to_features(examples, label_list, max_seq_length,
tokenizer):
"""Convert a set of `InputExample`s to a list of `InputFeatures`."""
features = []
for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
print("Writing example %d of %d" % (ex_index, len(examples)))
feature = convert_single_example(ex_index, example, label_list,
max_seq_length, tokenizer)
features.append(feature)
return features
def _read_tsv(input_file, delimiter="\t", quotechar=None):
"""Reads a tab separated value file."""
with io.open(input_file, "r", encoding="utf8") as f:
reader = csv.reader(f, delimiter=delimiter, quotechar=quotechar)
lines = []
for line in reader:
lines.append(line)
return lines
class SingleSentenceDataset(Dataset):
def __init__(self,
tokenizer,
label_list,
max_seq_length,
mode="all_in_memory"):
assert isinstance(mode,
str), "mode of SingleSentenceDataset should be str"
        assert mode in [
            "all_in_memory", "leveldb", "streaming"
        ], "mode of SingleSentenceDataset should be in [all_in_memory, leveldb, streaming], but got %s" % mode
self.delimiter = None
self.mode = mode
self.examples = []
self._db = None
self._line_processor = None
def load_all_data_in_memory(self,
input_file,
label_list,
max_seq_length,
tokenizer,
line_processor=None,
delimiter="\t",
quotechar=None):
lines = _read_tsv(input_file, delimiter=delimiter, quotechar=quotechar)
def default_line_processor(line_id, line):
assert len(line) == 2
text_a = line[0]
label = line[1]
return BertInputExample(
str(line_id), text_a=text_a, text_b=None, label=label)
if line_processor is None:
line_processor = default_line_processor
for (line_id, line) in enumerate(lines):
input_example = line_processor(line_id, line)
if not input_example:
continue
input_feature = convert_single_example(
str(line_id), input_example, label_list, max_seq_length,
tokenizer)
self.examples.append(input_feature)
def prepare_leveldb(self,
input_file,
leveldb_file,
label_list,
max_seq_length,
tokenizer,
line_processor=None,
delimiter="\t",
quotechar=None):
def default_line_processor(line_id, line):
assert len(line) == 2
text_a = line[0]
label = line[1]
return BertInputExample(
str(line_id), text_a=text_a, text_b=None, label=label)
if line_processor is None:
line_processor = default_line_processor
if ParallelEnv().nranks > 1:
leveldb_file = leveldb_file + "_" + str(ParallelEnv().local_rank)
if not os.path.exists(leveldb_file):
print("putting data %s into leveldb %s" %
(input_file, leveldb_file))
_example_num = 0
_db = leveldb.LevelDB(leveldb_file, create_if_missing=True)
with io.open(input_file, "r", encoding="utf8") as f:
reader = csv.reader(
f, delimiter=delimiter, quotechar=quotechar)
line_id = 0
for (_line_id, line) in enumerate(reader):
if line_processor(str(_line_id), line) is None:
continue
line_str = delimiter.join(line)
_db.Put(
str(line_id).encode("utf8"), line_str.encode("utf8"))
line_id += 1
_example_num += 1
_db.Put("_example_num_".encode("utf8"),
str(_example_num).encode("utf8"))
else:
_db = leveldb.LevelDB(leveldb_file, create_if_missing=False)
self.label_list = label_list
self.max_seq_length = max_seq_length
self.tokenizer = tokenizer
self.delimiter = delimiter
self._db = _db
self._line_processor = line_processor
def __getitem__(self, idx):
if self.mode == "all_in_memory":
return self.examples[idx].input_ids, self.examples[
idx].pos_ids, self.examples[idx].segment_ids, self.examples[
idx].label_id
if self.mode == "leveldb":
            assert self._db is not None, "you should call prepare_leveldb before you run the dataloader"
line_str = self._db.Get(str(idx).encode("utf8"))
line_str = line_str.decode("utf8")
line = line_str.split(self.delimiter)
input_example = self._line_processor(str(idx + 1), line)
input_example = convert_single_example(
str(idx + 1), input_example, self.label_list,
self.max_seq_length, self.tokenizer)
return input_example.input_ids, input_example.pos_ids, input_example.segment_ids, input_example.label_id
def __len__(self):
if self.mode == "all_in_memory":
return len(self.examples)
if self.mode == "leveldb":
            assert self._db is not None, "you should call prepare_leveldb before you run the dataloader"
            example_num = self._db.Get("_example_num_".encode("utf8"))
            example_num = example_num.decode("utf8")
            return int(example_num)
class SentencePairDataset(Dataset):
    def __init__(self,
                 tokenizer,
                 label_list,
                 max_seq_length,
                 mode="all_in_memory"):
        assert isinstance(mode,
                          str), "mode of SentencePairDataset should be str"
        assert mode in [
            "all_in_memory", "leveldb"
        ], "mode of SentencePairDataset should be in [all_in_memory, leveldb], but got %s" % mode
        self.mode = mode
        self.examples = []
def load_all_data_in_memory(self,
input_file,
label_list,
max_seq_length,
tokenizer,
line_processor=None,
delimiter="\t",
quotechar=None):
lines = _read_tsv(input_file, delimiter=delimiter, quotechar=quotechar)
def default_line_processor(line_id, line):
assert len(line) == 3
text_a = line[0]
text_b = line[1]
label = line[2]
return BertInputExample(
str(line_id), text_a=text_a, text_b=text_b, label=label)
if line_processor is None:
line_processor = default_line_processor
for (line_id, line) in enumerate(lines):
input_example = line_processor(line_id, line)
if not input_example:
continue
input_feature = convert_single_example(
str(line_id), input_example, label_list, max_seq_length,
tokenizer)
self.examples.append(input_feature)
def __getitem__(self, idx):
return self.examples[idx].input_ids, self.examples[
idx].pos_ids, self.examples[idx].segment_ids, self.examples[
idx].label_id
def __len__(self):
return len(self.examples)
def _prepare_train_batch(insts,
vocab_size=0,
pad_id=None,
cls_id=None,
sep_id=None,
mask_id=-1,
return_input_mask=True,
return_max_len=True,
return_num_token=False):
return prepare_batch_data(
insts,
0,
voc_size=vocab_size,
pad_id=pad_id,
cls_id=cls_id,
sep_id=sep_id,
mask_id=mask_id,
return_input_mask=return_input_mask,
return_max_len=return_max_len,
return_num_token=return_num_token)
class BertDataLoader(object):
def __init__(self,
input_file,
tokenizer,
label_list,
max_seq_length,
batch_size,
shuffle=False,
drop_last=False,
mode="all_in_memory",
leveldb_file="./leveldb",
line_processor=None,
delimiter="\t",
quotechar=None,
device=fluid.CPUPlace(),
num_workers=0,
return_list=True,
phase="train"):
        assert phase in [
            "train", "predict", "test"
        ], "phase of BertDataLoader should be in [train, predict, test], but got %s" % phase
self.dataset = SingleSentenceDataset(tokenizer, label_list,
max_seq_length, mode)
if mode == "all_in_memory":
self.dataset.load_all_data_in_memory(
input_file, label_list, max_seq_length, tokenizer,
line_processor, delimiter, quotechar)
elif mode == "leveldb":
self.dataset.prepare_leveldb(input_file, leveldb_file, label_list,
max_seq_length, tokenizer,
line_processor, delimiter, quotechar)
else:
raise ValueError("mode should be in [all_in_memory, leveldb]")
if phase == "train":
self.sampler = DistributedBatchSampler(
self.dataset, batch_size, shuffle=shuffle, drop_last=drop_last)
elif phase == "test" or phase == "predict":
self.sampler = BatchSampler(
dataset=self.dataset,
batch_size=batch_size,
shuffle=shuffle,
drop_last=drop_last)
self.dataloader = DataLoader(
dataset=self.dataset,
batch_sampler=self.sampler,
places=device,
collate_fn=partial(
_prepare_train_batch,
vocab_size=-1,
pad_id=tokenizer.vocab["[PAD]"],
cls_id=tokenizer.vocab["[CLS]"],
sep_id=tokenizer.vocab["[SEP]"],
mask_id=-1,
return_input_mask=True,
return_max_len=False,
return_num_token=False),
num_workers=num_workers,
return_list=return_list)
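# Illustrative sketch (the path and label set are assumptions): build a train
# loader over a two-column TSV of (text, label) using the default line
# processor, then pull one collated batch.
def _demo_bert_dataloader(tokenizer):
    loader = BertDataLoader(
        "./data/train.tsv",
        tokenizer,
        label_list=["0", "1"],
        max_seq_length=128,
        batch_size=32,
        shuffle=True,
        phase="train")
    for src_id, pos_id, sent_id, input_mask, label in loader.dataloader:
        # one batch with the same layout as prepare_batch_data's output
        break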
if __name__ == "__main__":
print("hello world.")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
class ConstantLR(LearningRateDecay):
def __init__(self, learning_rate, begin=0, step=1, dtype='float32'):
super(ConstantLR, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
def step(self):
return self.learning_rate
class LinearDecay(LearningRateDecay):
def __init__(self,
learning_rate,
warmup_steps,
decay_steps,
end_learning_rate=0.0001,
power=1.0,
cycle=False,
begin=0,
step=1,
dtype='float32'):
super(LinearDecay, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
self.warmup_steps = warmup_steps
self.decay_steps = decay_steps
self.end_learning_rate = end_learning_rate
self.power = power
self.cycle = cycle
def step(self):
if self.step_num < self.warmup_steps:
decayed_lr = self.learning_rate * (self.step_num /
self.warmup_steps)
decayed_lr = self.create_lr_var(decayed_lr)
else:
tmp_step_num = self.step_num
tmp_decay_steps = self.decay_steps
if self.cycle:
div_res = fluid.layers.ceil(
self.create_lr_var(tmp_step_num / float(self.decay_steps)))
if tmp_step_num == 0:
div_res = self.create_lr_var(1.0)
tmp_decay_steps = self.decay_steps * div_res
else:
tmp_step_num = self.create_lr_var(
tmp_step_num
if tmp_step_num < self.decay_steps else self.decay_steps)
decayed_lr = (self.learning_rate - self.end_learning_rate) * \
((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate
return decayed_lr
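# Illustrative sketch with assumed numbers: warmup ramps the rate linearly to
# 1e-4 over the first 100 steps (step 50 -> 5e-5), after which it decays
# linearly toward end_learning_rate over decay_steps.
def _demo_linear_decay():
    with fluid.dygraph.guard():
        sched = LinearDecay(1e-4, warmup_steps=100, decay_steps=1000)
        lrs = [float(sched().numpy()) for _ in range(5)]  # __call__ advances step_num
    return lrs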
class Optimizer(object):
def __init__(self,
warmup_steps,
num_train_steps,
learning_rate,
model_cls,
weight_decay,
scheduler='linear_warmup_decay',
loss_scaling=1.0,
parameter_list=None):
self.warmup_steps = warmup_steps
self.num_train_steps = num_train_steps
self.learning_rate = learning_rate
self.model_cls = model_cls
self.weight_decay = weight_decay
self.scheduler = scheduler
self.loss_scaling = loss_scaling
self.parameter_list = parameter_list
self.scheduled_lr = 0.0
self.optimizer = self.lr_schedule()
def lr_schedule(self):
if self.warmup_steps > 0:
if self.scheduler == 'noam_decay':
self.scheduled_lr = fluid.dygraph.NoamDecay(1 / (
self.warmup_steps * (self.learning_rate**2)),
self.warmup_steps)
elif self.scheduler == 'linear_warmup_decay':
self.scheduled_lr = LinearDecay(self.learning_rate,
self.warmup_steps,
self.num_train_steps, 0.0)
else:
raise ValueError("Unkown learning rate scheduler, should be "
"'noam_decay' or 'linear_warmup_decay'")
optimizer = fluid.optimizer.Adam(
learning_rate=self.scheduled_lr,
parameter_list=self.parameter_list)
else:
self.scheduled_lr = ConstantLR(self.learning_rate)
optimizer = fluid.optimizer.Adam(
learning_rate=self.scheduled_lr,
parameter_list=self.parameter_list)
return optimizer
def exclude_from_weight_decay(self, name):
if name.find("layer_norm") > -1:
return True
bias_suffix = ["_bias", "_b", ".b_0"]
for suffix in bias_suffix:
if name.endswith(suffix):
return True
return False
def state_dict(self):
return self.optimizer.state_dict()
def set_dict(self, state_dict):
return self.optimizer.set_dict(state_dict)
def get_opti_var_name_list(self):
return self.optimizer.get_opti_var_name_list()
def current_step_lr(self):
return self.optimizer.current_step_lr()
def minimize(self, loss, use_data_parallel=False, model=None):
param_list = dict()
clip_norm_thres = 1.0
#grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm_thres)
if use_data_parallel:
loss = model.scale_loss(loss)
loss.backward()
if self.weight_decay > 0:
for param in self.model_cls.parameters():
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
if use_data_parallel:
assert model is not None
model.apply_collective_grads()
#_, param_grads = self.optimizer.minimize(loss, grad_clip=grad_clip)
_, param_grads = self.optimizer.minimize(loss)
if self.weight_decay > 0:
for param, grad in param_grads:
if self.exclude_from_weight_decay(param.name):
continue
                lr_value = self.scheduled_lr.step()
                if not isinstance(lr_value, float):
                    lr_value = float(lr_value.numpy())
                updated_param = param.numpy() - param_list[
                    param.name].numpy() * self.weight_decay * lr_value
                # rebinding the local name `param` would be a no-op; write the
                # decayed value back into the parameter instead
                param.set_value(updated_param)
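# Illustrative sketch (the numbers are assumptions): wire the warmup/decay
# schedule and Adam to a dygraph `model`, then take one weight-decayed step on
# a precomputed `loss`.
def _demo_optimizer(model, loss):
    opt = Optimizer(
        warmup_steps=100,
        num_train_steps=1000,
        learning_rate=1e-4,
        model_cls=model,
        weight_decay=0.01,
        scheduler='linear_warmup_decay',
        parameter_list=model.parameters())
    opt.minimize(loss)
    model.clear_gradients()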
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
from utils.fp16 import create_master_params_grads, master_param_to_train_param, apply_dynamic_loss_scaling
def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
""" Applies linear warmup of learning rate from 0 and decay to 0."""
with fluid.default_main_program()._lr_schedule_guard():
lr = fluid.layers.tensor.create_global_var(
shape=[1],
value=0.0,
dtype='float32',
persistable=True,
name="scheduled_learning_rate")
global_step = fluid.layers.learning_rate_scheduler._decay_step_counter(
)
with fluid.layers.control_flow.Switch() as switch:
with switch.case(global_step < warmup_steps):
warmup_lr = learning_rate * (global_step / warmup_steps)
fluid.layers.tensor.assign(warmup_lr, lr)
with switch.default():
decayed_lr = fluid.layers.learning_rate_scheduler.polynomial_decay(
learning_rate=learning_rate,
decay_steps=num_train_steps,
end_learning_rate=0.0,
power=1.0,
cycle=False)
fluid.layers.tensor.assign(decayed_lr, lr)
return lr
def optimization(loss,
warmup_steps,
num_train_steps,
learning_rate,
train_program,
startup_prog,
weight_decay,
scheduler='linear_warmup_decay',
use_fp16=False,
use_dynamic_loss_scaling=False,
init_loss_scaling=1.0,
incr_every_n_steps=1000,
decr_every_n_nan_or_inf=2,
incr_ratio=2.0,
decr_ratio=0.8):
scheduled_lr, loss_scaling = None, None
if scheduler == 'noam_decay':
if warmup_steps > 0:
scheduled_lr = fluid.layers.learning_rate_scheduler\
.noam_decay(1/(warmup_steps *(learning_rate ** 2)),
warmup_steps)
else:
print(
"WARNING: noam decay of learning rate should have postive warmup "
"steps but given {}, using constant learning rate instead!"
.format(warmup_steps))
scheduled_lr = fluid.layers.create_global_var(
name=fluid.unique_name.generate("learning_rate"),
shape=[1],
value=learning_rate,
dtype='float32',
persistable=True)
elif scheduler == 'linear_warmup_decay':
if warmup_steps > 0:
scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
num_train_steps)
else:
print(
"WARNING: linear warmup decay of learning rate should have "
"postive warmup steps but given {}, use constant learning rate "
"instead!".format(warmup_steps))
scheduled_lr = fluid.layers.create_global_var(
name=fluid.unique_name.generate("learning_rate"),
shape=[1],
value=learning_rate,
dtype='float32',
persistable=True)
else:
raise ValueError("Unkown learning rate scheduler, should be "
"'noam_decay' or 'linear_warmup_decay'")
optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
    def exclude_from_weight_decay(param):
        # strip a literal ".master" suffix; str.rstrip(".master") would strip
        # any trailing characters from that set, not the suffix itself
        name = param.name
        if name.endswith(".master"):
            name = name[:-len(".master")]
if name.find("layer_norm") > -1:
return True
bias_suffix = ["_bias", "_b", ".b_0"]
for suffix in bias_suffix:
if name.endswith(suffix):
return True
return False
param_list = dict()
if use_fp16:
loss_scaling = fluid.layers.create_global_var(
name=fluid.unique_name.generate("loss_scaling"),
shape=[1],
value=init_loss_scaling,
dtype='float32',
persistable=True)
loss *= loss_scaling
param_grads = optimizer.backward(loss)
master_param_grads = create_master_params_grads(
param_grads, train_program, startup_prog, loss_scaling)
if weight_decay > 0:
for param, _ in master_param_grads:
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
if use_dynamic_loss_scaling:
apply_dynamic_loss_scaling(
loss_scaling, master_param_grads, incr_every_n_steps,
decr_every_n_nan_or_inf, incr_ratio, decr_ratio)
optimizer.apply_gradients(master_param_grads)
if weight_decay > 0:
for param, grad in master_param_grads:
if exclude_from_weight_decay(param):
continue
with param.block.program._optimized_guard(
[param, grad]), fluid.framework.name_scope("weight_decay"):
updated_param = param - param_list[
param.name] * weight_decay * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
master_param_to_train_param(master_param_grads, param_grads,
train_program)
else:
if weight_decay > 0:
for param in train_program.all_parameters():
param_list[param.name] = param * 1.0
param_list[param.name].stop_gradient = True
_, param_grads = optimizer.minimize(loss)
if weight_decay > 0:
for param, grad in param_grads:
if exclude_from_weight_decay(param):
continue
with param.block.program._optimized_guard(
[param, grad]), fluid.framework.name_scope("weight_decay"):
updated_param = param - param_list[
param.name] * weight_decay * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
return scheduled_lr, loss_scaling
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from hapi.text.bert.utils.args import str2bool as str2bool
from hapi.text.bert.utils.args import ArgumentGroup as ArgumentGroup
from hapi.text.bert.utils.args import print_arguments as print_arguments
from hapi.text.bert.utils.args import check_cuda as check_cuda
from hapi.text.bert.utils.cards import get_cards as get_cards
from hapi.text.bert.utils.fp16 import cast_fp16_to_fp32 as cast_fp16_to_fp32
from hapi.text.bert.utils.fp16 import cast_fp32_to_fp16 as cast_fp32_to_fp16
from hapi.text.bert.utils.fp16 import copy_to_master_param as copy_to_master_param
from hapi.text.bert.utils.fp16 import create_master_params_grads as create_master_params_grads
from hapi.text.bert.utils.fp16 import master_param_to_train_param as master_param_to_train_param
from hapi.text.bert.utils.init import init_checkpoint as init_checkpoint
from hapi.text.bert.utils.init import init_pretraining_params as init_pretraining_params
from hapi.text.bert.utils.init import init_from_static_model as init_from_static_model
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Arguments for configuration."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import sys
import argparse
import paddle.fluid as fluid
def str2bool(v):
    # argparse cannot parse strings like "True"/"False" into Python booleans
    # directly, so treat these spellings as truthy
return v.lower() in ("true", "t", "1")
class ArgumentGroup(object):
def __init__(self, parser, title, des):
self._group = parser.add_argument_group(title=title, description=des)
def add_arg(self, name, type, default, help, **kwargs):
type = str2bool if type == bool else type
self._group.add_argument(
"--" + name,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(six.iteritems(vars(args))):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
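# Illustrative sketch: group related flags with ArgumentGroup, then print the
# parsed result; parse_args([]) just exercises the defaults.
def _demo_argument_group():
    parser = argparse.ArgumentParser()
    train_g = ArgumentGroup(parser, "training", "training options.")
    train_g.add_arg("epoch", int, 3, "Number of epochs for fine-tuning.")
    train_g.add_arg("use_cuda", bool, True, "Whether to run on GPU.")
    args = parser.parse_args([])
    print_arguments(args)
    return args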
def check_cuda(use_cuda, err = \
"\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \
Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n"
):
    try:
        if use_cuda and not fluid.is_compiled_with_cuda():
            print(err)
            sys.exit(1)
    except Exception:
        pass
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
def get_cards():
"""
get gpu cards number
"""
num = 0
cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
if cards != '':
num = len(cards.split(","))
return num
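# Example: with CUDA_VISIBLE_DEVICES="0,1,2" get_cards() returns 3; when the
# variable is unset or empty it returns 0.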
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import sys
import os
def usage():
"""
usage information
"""
    print()
print("please use command: ")
print(
"python convert_static_to_dygraph.py input_params_dir output_params_dir"
)
    print()
def convert_static_to_dygraph(static_model_path, dygraph_model_path):
"""
convert paddle static bert model to dygraph model
"""
def mkdir(path):
if not os.path.isdir(path):
if os.path.split(path)[0]:
mkdir(os.path.split(path)[0])
else:
return
os.mkdir(path)
if os.path.exists(dygraph_model_path):
shutil.rmtree(dygraph_model_path)
mkdir(dygraph_model_path)
if not os.path.exists(static_model_path):
print("paddle static model path doesn't exist.....")
return -1
file_list = []
for root, dirs, files in os.walk(static_model_path):
file_list.extend(files)
os.makedirs(os.path.join(dygraph_model_path, "PretrainModelLayer_0"))
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0"))
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/PrePostProcessLayer_0"))
os.makedirs(
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0"))
#os.chdir(static_model_path)
#convert embedding file
embedding_type = ["word", "pos", "sent"]
for i in range(3):
src_name = embedding_type[i] + "_embedding"
trg_name = "Embedding_" + str(i) + "." + src_name
shutil.copyfile(
os.path.join(static_model_path, src_name),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/" + trg_name))
#convert pre_encoder file
shutil.copyfile(
os.path.join(static_model_path, "pre_encoder_layer_norm_scale"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
))
shutil.copyfile(
os.path.join(static_model_path, "pre_encoder_layer_norm_bias"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
))
#convert mask lm params file
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_out_fc.b_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/Layer_0.mask_lm_out_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_fc.b_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_0.mask_lm_trans_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_fc.w_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_0.mask_lm_trans_fc.w_0"))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_layer_norm_bias"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_bias"
))
shutil.copyfile(
os.path.join(static_model_path, "mask_lm_trans_layer_norm_scale"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/PrePostProcessLayer_0/LayerNorm_0._layer_norm_scale"
))
shutil.copyfile(
os.path.join(static_model_path, "next_sent_fc.b_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_1.next_sent_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "next_sent_fc.w_0"),
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/FC_1.next_sent_fc.w_0"))
shutil.copyfile(
os.path.join(static_model_path, "pooled_fc.b_0"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.b_0"))
shutil.copyfile(
os.path.join(static_model_path, "pooled_fc.w_0"),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/FC_0.pooled_fc.w_0"))
encoder_num = 0
for f in file_list:
if not f.startswith("encoder_layer"):
continue
layer_num = f.split('_')[2]
if int(layer_num) > encoder_num:
encoder_num = int(layer_num)
encoder_num += 1
for i in range(encoder_num):
encoder_dir = "EncoderSubLayer_" + str(i)
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir))
os.makedirs(
os.path.join(dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir +
"/PositionwiseFeedForwardLayer_0"))
os.makedirs(
os.path.join(
dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir + "/MultiHeadAttentionLayer_0"))
os.makedirs(
os.path.join(
dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_1"))
os.makedirs(
os.path.join(
dygraph_model_path, "PretrainModelLayer_0/BertModelLayer_0/" +
"EncoderLayer_0/", encoder_dir + "/PrePostProcessLayer_3"))
encoder_map_dict = {
"ffn_fc_0.b_0":
("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.b_0"),
"ffn_fc_0.w_0":
("PositionwiseFeedForwardLayer_0", "FC_0.ffn_fc_0.w_0"),
"ffn_fc_1.b_0":
("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.b_0"),
"ffn_fc_1.w_0":
("PositionwiseFeedForwardLayer_0", "FC_1.ffn_fc_1.w_0"),
"multi_head_att_key_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_1.key_fc.b_0"),
"multi_head_att_key_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_1.key_fc.w_0"),
"multi_head_att_output_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_3.output_fc.b_0"),
"multi_head_att_output_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_3.output_fc.w_0"),
"multi_head_att_query_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_0.query_fc.b_0"),
"multi_head_att_query_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_0.query_fc.w_0"),
"multi_head_att_value_fc.b_0":
("MultiHeadAttentionLayer_0", "FC_2.value_fc.b_0"),
"multi_head_att_value_fc.w_0":
("MultiHeadAttentionLayer_0", "FC_2.value_fc.w_0"),
"post_att_layer_norm_bias":
("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_bias"),
"post_att_layer_norm_scale":
("PrePostProcessLayer_1", "LayerNorm_0.post_att_layer_norm_scale"),
"post_ffn_layer_norm_bias":
("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_bias"),
"post_ffn_layer_norm_scale":
("PrePostProcessLayer_3", "LayerNorm_0.post_ffn_layer_norm_scale")
}
for f in file_list:
if not f.startswith("encoder_layer"):
continue
layer_num = f.split('_')[2]
suffix_name = "_".join(f.split('_')[3:])
in_dir = encoder_map_dict[suffix_name][0]
rename = encoder_map_dict[suffix_name][1]
encoder_layer = "EncoderSubLayer_" + layer_num
shutil.copyfile(
os.path.join(static_model_path, f),
os.path.join(
dygraph_model_path,
"PretrainModelLayer_0/BertModelLayer_0/EncoderLayer_0/" +
encoder_layer + "/" + in_dir + "/" + rename))
if __name__ == "__main__":
if len(sys.argv) < 3:
usage()
        sys.exit(1)
static_model_path = sys.argv[1]
dygraph_model_path = sys.argv[2]
convert_static_to_dygraph(static_model_path, dygraph_model_path)
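# Example invocation (both directories are placeholders):
#
#   python convert_static_to_dygraph.py ./static_bert_params ./dygraph_bert_params
#
# The script mirrors each static parameter file into the nested directory
# layout that the dygraph PretrainModelLayer expects when loading weights.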
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
def cast_fp16_to_fp32(i, o, prog):
prog.global_block().append_op(
type="cast",
inputs={"X": i},
outputs={"Out": o},
attrs={
"in_dtype": fluid.core.VarDesc.VarType.FP16,
"out_dtype": fluid.core.VarDesc.VarType.FP32
})
def cast_fp32_to_fp16(i, o, prog):
prog.global_block().append_op(
type="cast",
inputs={"X": i},
outputs={"Out": o},
attrs={
"in_dtype": fluid.core.VarDesc.VarType.FP32,
"out_dtype": fluid.core.VarDesc.VarType.FP16
})
def copy_to_master_param(p, block):
v = block.vars.get(p.name, None)
if v is None:
raise ValueError("no param name %s found!" % p.name)
new_p = fluid.framework.Parameter(
block=block,
shape=v.shape,
dtype=fluid.core.VarDesc.VarType.FP32,
type=v.type,
lod_level=v.lod_level,
stop_gradient=p.stop_gradient,
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name + ".master")
return new_p
def create_master_params_grads(params_grads, main_prog, startup_prog,
loss_scaling):
master_params_grads = []
tmp_role = main_prog._current_role
OpRole = fluid.core.op_proto_and_checker_maker.OpRole
main_prog._current_role = OpRole.Backward
for p, g in params_grads:
# create master parameters
master_param = copy_to_master_param(p, main_prog.global_block())
startup_master_param = startup_prog.global_block()._clone_variable(
master_param)
startup_p = startup_prog.global_block().var(p.name)
cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
# cast fp16 gradients to fp32 before apply gradients
if g.name.find("layer_norm") > -1:
if loss_scaling > 1:
scaled_g = g / float(loss_scaling)
else:
scaled_g = g
master_params_grads.append([p, scaled_g])
continue
master_grad = fluid.layers.cast(g, "float32")
if loss_scaling > 1:
master_grad = master_grad / float(loss_scaling)
master_params_grads.append([master_param, master_grad])
main_prog._current_role = tmp_role
return master_params_grads
def master_param_to_train_param(master_params_grads, params_grads, main_prog):
for idx, m_p_g in enumerate(master_params_grads):
train_p, _ = params_grads[idx]
if train_p.name.find("layer_norm") > -1:
continue
with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)
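# A minimal sketch of how the two helpers above combine in an fp16 training
# step; `optimizer`, `avg_loss` and the programs are assumed to be built
# elsewhere, so this is illustrative rather than part of the original file:
def _demo_mixed_precision_update(optimizer, avg_loss, main_prog, startup_prog,
                                 loss_scaling=128.0):
    # backward on the scaled loss yields (mostly) fp16 gradients
    params_grads = optimizer.backward(avg_loss * loss_scaling)
    # keep fp32 master copies and cast/rescale the gradients up to fp32
    master_params_grads = create_master_params_grads(
        params_grads, main_prog, startup_prog, loss_scaling)
    # apply the update in fp32, then copy the result back into fp16 params
    optimizer.apply_gradients(master_params_grads)
    master_param_to_train_param(master_params_grads, params_grads, main_prog)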
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import six
import ast
import copy
import numpy as np
import paddle.fluid as fluid
def cast_fp32_to_fp16(exe, main_program):
print("Cast parameters to float16 data format.")
for param in main_program.global_block().all_parameters():
if not param.name.endswith(".master"):
param_t = fluid.global_scope().find_var(param.name).get_tensor()
data = np.array(param_t)
if param.name.find("layer_norm") == -1:
param_t.set(np.float16(data).view(np.uint16), exe.place)
master_param_var = fluid.global_scope().find_var(param.name +
".master")
if master_param_var is not None:
master_param_var.get_tensor().set(data, exe.place)
def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
assert os.path.exists(
init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path
    def existed_persistables(var):
if not fluid.io.is_persistable(var):
return False
return os.path.exists(os.path.join(init_checkpoint_path, var.name))
fluid.io.load_vars(
exe,
init_checkpoint_path,
main_program=main_program,
        predicate=existed_persistables)
print("Load model from {}".format(init_checkpoint_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
def init_pretraining_params(exe,
pretraining_params_path,
main_program,
use_fp16=False):
assert os.path.exists(pretraining_params_path
), "[%s] cann't be found." % pretraining_params_path
def existed_params(var):
if not isinstance(var, fluid.framework.Parameter):
return False
return os.path.exists(os.path.join(pretraining_params_path, var.name))
fluid.io.load_vars(
exe,
pretraining_params_path,
main_program=main_program,
predicate=existed_params)
print("Load pretraining parameters from {}.".format(
pretraining_params_path))
if use_fp16:
cast_fp32_to_fp16(exe, main_program)
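# Usage sketch for the two loaders above (the executor, program and
# checkpoint path are placeholders for whatever the training script builds):
def _demo_init_params(checkpoint_dir="./saved_model/step_100"):
    exe = fluid.Executor(fluid.CPUPlace())
    main_prog = fluid.default_main_program()
    exe.run(fluid.default_startup_program())
    # resume a full training state; init_pretraining_params would instead
    # warm-start from pretrained parameters only
    init_checkpoint(exe, checkpoint_dir, main_prog, use_fp16=False)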
def init_from_static_model(dir_path,
backbone_model,
bert_config,
verbose=False):
def load_numpy_weight(file_name):
if six.PY2:
res = np.load(os.path.join(dir_path, file_name), allow_pickle=True)
else:
res = np.load(
os.path.join(dir_path, file_name),
allow_pickle=True,
encoding='latin1')
assert res is not None
return res
# load word embedding
_param = load_numpy_weight("word_embedding")
backbone_model._src_emb.set_dict({"weight": _param})
if verbose:
print("INIT word embedding")
_param = load_numpy_weight("pos_embedding")
backbone_model._pos_emb.set_dict({"weight": _param})
if verbose:
print("INIT pos embedding")
_param = load_numpy_weight("sent_embedding")
backbone_model._sent_emb.set_dict({"weight": _param})
if verbose:
print("INIT sent embedding")
_param0 = load_numpy_weight("pooled_fc.w_0")
_param1 = load_numpy_weight("pooled_fc.b_0")
backbone_model.pooled_fc.set_dict({"weight": _param0, "bias": _param1})
if verbose:
print("INIT pooled_fc")
_param0 = load_numpy_weight("pre_encoder_layer_norm_scale")
_param1 = load_numpy_weight("pre_encoder_layer_norm_bias")
backbone_model.pre_process_layer._sub_layers["layer_norm_0"].set_dict({
"weight": _param0,
"bias": _param1
})
if verbose:
print("INIT pre_encoder layer norm")
for _i in range(bert_config["num_hidden_layers"]):
_param_weight = "encoder_layer_%d_multi_head_att_query_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_query_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" %
_i].self_attn.q_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT multi_head_att_query_fc %d" % _i)
_param_weight = "encoder_layer_%d_multi_head_att_key_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_key_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" %
_i].self_attn.k_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT multi_head_att_key_fc %d" % _i)
_param_weight = "encoder_layer_%d_multi_head_att_value_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_value_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" %
_i].self_attn.v_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT multi_head_att_value_fc %d" % _i)
# init output fc
_param_weight = "encoder_layer_%d_multi_head_att_output_fc.w_0" % _i
_param_bias = "encoder_layer_%d_multi_head_att_output_fc.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" %
_i].self_attn.proj_fc.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT multi_head_att_output_fc %d" % _i)
# init layer_norm 1
_param_weight = "encoder_layer_%d_post_att_layer_norm_scale" % _i
_param_bias = "encoder_layer_%d_post_att_layer_norm_bias" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers[
"layer_%d" % _i].postprocesser1.layer_norm_0.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT layer norm in attention at %d layer" % _i)
# init layer_norm 2
_param_weight = "encoder_layer_%d_post_ffn_layer_norm_scale" % _i
_param_bias = "encoder_layer_%d_post_ffn_layer_norm_bias" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers[
"layer_%d" % _i].postprocesser2.layer_norm_0.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT layer norm in FFN at %d layer" % _i)
# init FFN 1
_param_weight = "encoder_layer_%d_ffn_fc_0.w_0" % _i
_param_bias = "encoder_layer_%d_ffn_fc_0.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" % _i].ffn.fc1.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT FFN-1 at %d layer" % _i)
# init FFN 2
_param_weight = "encoder_layer_%d_ffn_fc_1.w_0" % _i
_param_bias = "encoder_layer_%d_ffn_fc_1.b_0" % _i
_param_weight = load_numpy_weight(_param_weight)
_param_bias = load_numpy_weight(_param_bias)
backbone_model._encoder._sub_layers["layer_%d" % _i].ffn.fc2.set_dict({
"weight": _param_weight,
"bias": _param_bias
})
if verbose:
print("INIT FFN-2 at %d layer" % _i)
return True
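# A hedged sketch of wiring the loader above to a dygraph backbone; the
# backbone class name and parameter directory are assumptions for
# illustration:
#
#   with fluid.dygraph.guard():
#       backbone = BertModelLayer(config=bert_config, return_pooled_out=True)
#       init_from_static_model("./static_bert_params", backbone, bert_config,
#                              verbose=True)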
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import six
import sys
if six.PY2:
reload(sys)
sys.setdefaultencoding('utf8')
import ast
import time
import argparse
import numpy as np
import multiprocessing
import collections
import copy
from functools import partial, reduce
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers.utils as utils
from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
from paddle.fluid.dygraph import to_variable, Embedding, Linear, LayerNorm, GRUUnit
from paddle.fluid.data_feeder import convert_dtype
from paddle.fluid import layers
...@@ -19,7 +48,8 @@ __all__ = [
    'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode',
    'BeamSearchDecoder', 'MultiHeadAttention', 'FFN',
    'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer',
    'TransformerDecoder', 'TransformerBeamSearchDecoder', 'Linear_chain_crf',
    'Crf_decoding', 'SequenceTagging'
]
...@@ -188,7 +218,19 @@ class BasicLSTMCell(RNNCell):
                 gate_activation=None,
                 activation=None,
                 forget_bias=1.0,
                 dtype='float32',
forget_gate_weights={"w": None,
"h": None,
"b": None},
input_gate_weights={"w": None,
"h": None,
"b": None},
output_gate_weights={"w": None,
"h": None,
"b": None},
cell_weights={"w": None,
"h": None,
"b": None}):
        super(BasicLSTMCell, self).__init__()

        self._hidden_size = hidden_size
...@@ -202,25 +244,225 @@ class BasicLSTMCell(RNNCell):
        self._dtype = dtype
        self._input_size = input_size

        self.use_customized_weight = False
        for _weights in [
                forget_gate_weights, input_gate_weights, output_gate_weights,
                cell_weights
        ]:
            for _key in _weights:
                if _weights[_key] is not None:
                    self.use_customized_weight = True
                    break
            if self.use_customized_weight:
                break
if not self.use_customized_weight:
self._weight = self.create_parameter(
attr=self._param_attr,
shape=[
self._input_size + self._hidden_size, 4 * self._hidden_size
],
dtype=self._dtype)
self._bias = self.create_parameter(
attr=self._bias_attr,
shape=[4 * self._hidden_size],
dtype=self._dtype,
is_bias=True)
else:
if "w" in forget_gate_weights and forget_gate_weights[
"w"] is not None:
self.fg_w = forget_gate_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_forget_gate_w"
else:
tmp_param_attr = self._param_attr
self.fg_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in forget_gate_weights and forget_gate_weights[
"h"] is not None:
self.fg_h = forget_gate_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_forget_gate_h"
else:
tmp_param_attr = self._param_attr
self.fg_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in forget_gate_weights and forget_gate_weights[
"b"] is not None:
self.fg_b = forget_gate_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_forget_gate_b"
else:
tmp_param_attr = self._bias_attr
self.fg_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
if "w" in input_gate_weights and input_gate_weights[
"w"] is not None:
self.ig_w = input_gate_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_input_gate_w"
else:
tmp_param_attr = self._param_attr
self.ig_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in input_gate_weights and input_gate_weights[
"h"] is not None:
self.ig_h = input_gate_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_input_gate_h"
else:
tmp_param_attr = self._param_attr
self.ig_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in input_gate_weights and input_gate_weights[
"b"] is not None:
self.ig_b = input_gate_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_input_gate_b"
else:
tmp_param_attr = self._bias_attr
self.ig_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
if "w" in output_gate_weights and output_gate_weights[
"w"] is not None:
self.og_w = output_gate_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_output_gate_w"
else:
tmp_param_attr = self._param_attr
self.og_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in output_gate_weights and output_gate_weights[
"h"] is not None:
self.og_h = output_gate_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_output_gate_h"
else:
tmp_param_attr = self._param_attr
self.og_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in output_gate_weights and output_gate_weights[
"b"] is not None:
self.og_b = output_gate_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_output_gate_b"
else:
tmp_param_attr = self._bias_attr
self.og_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
if "w" in cell_weights and cell_weights["w"] is not None:
self.c_w = cell_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_cell_w"
else:
tmp_param_attr = self._param_attr
self.c_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in cell_weights and cell_weights["h"] is not None:
self.c_h = cell_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_cell_h"
else:
tmp_param_attr = self._param_attr
self.c_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in cell_weights and cell_weights["b"] is not None:
self.c_b = cell_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_cell_b"
else:
tmp_param_attr = self._bias_attr
self.c_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
    def forward(self, input, state):
if self.use_customized_weight:
weight_w = fluid.layers.concat(
[self.ig_w, self.c_w, self.fg_w, self.og_w], axis=-1)
weight_h = fluid.layers.concat(
[self.ig_h, self.c_h, self.fg_h, self.og_h], axis=-1)
_weight = fluid.layers.concat([weight_w, weight_h], axis=0)
_bias = fluid.layers.concat(
[self.ig_b, self.c_b, self.fg_b, self.og_b])
else:
_weight = self._weight
_bias = self._bias
        pre_hidden, pre_cell = state
        concat_input_hidden = layers.concat([input, pre_hidden], 1)
        gate_input = layers.matmul(x=concat_input_hidden, y=_weight)
        gate_input = layers.elementwise_add(gate_input, _bias)
        i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
        new_cell = layers.elementwise_add(
            layers.elementwise_mul(
...@@ -277,16 +519,39 @@ class BasicGRUCell(RNNCell):
                 bias_attr=None,
                 gate_activation=None,
                 activation=None,
                 dtype='float32',
update_gate_weights={"w": None,
"h": None,
"b": None},
reset_gate_weights={"w": None,
"h": None,
"b": None},
cell_weights={"w": None,
"h": None,
"b": None}):
        super(BasicGRUCell, self).__init__()
        self._input_size = input_size
        self._hidden_size = hidden_size
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._gate_activation = gate_activation or layers.sigmoid
        self._activation = activation or layers.tanh
        self._dtype = dtype
assert isinstance(update_gate_weights, dict)
assert isinstance(reset_gate_weights, dict)
assert isinstance(cell_weights, dict)
self.use_customized_weight = False
for _weights in [
update_gate_weights, reset_gate_weights, cell_weights
]:
for _key in _weights:
if _weights[_key] is not None:
self.use_customized_weight = True
if self.use_customized_weight:
break
        if self._param_attr is not None and self._param_attr.name is not None:
            gate_param_attr = copy.deepcopy(self._param_attr)
            candidate_param_attr = copy.deepcopy(self._param_attr)
...@@ -296,43 +561,194 @@ class BasicGRUCell(RNNCell):
            gate_param_attr = self._param_attr
            candidate_param_attr = self._param_attr
        if not self.use_customized_weight:
            self._gate_weight = self.create_parameter(
                attr=gate_param_attr,
                shape=[
                    self._input_size + self._hidden_size, 2 * self._hidden_size
                ],
                dtype=self._dtype)

            self._candidate_weight = self.create_parameter(
                attr=candidate_param_attr,
                shape=[
                    self._input_size + self._hidden_size, self._hidden_size
                ],
                dtype=self._dtype)

            if self._bias_attr is not None and self._bias_attr.name is not None:
                gate_bias_attr = copy.deepcopy(self._bias_attr)
                candidate_bias_attr = copy.deepcopy(self._bias_attr)
                gate_bias_attr.name += "_gate"
                candidate_bias_attr.name += "_candidate"
            else:
                gate_bias_attr = self._bias_attr
                candidate_bias_attr = self._bias_attr

            self._gate_bias = self.create_parameter(
                attr=gate_bias_attr,
                shape=[2 * self._hidden_size],
                dtype=self._dtype,
                is_bias=True)
            self._candidate_bias = self.create_parameter(
                attr=candidate_bias_attr,
                shape=[self._hidden_size],
                dtype=self._dtype,
                is_bias=True)
        else:
            # create the parameters of gates in gru
            if "w" in update_gate_weights and update_gate_weights[
                    "w"] is not None:
                self.ug_w = update_gate_weights["w"]
            else:
                if gate_param_attr is not None and gate_param_attr.name is not None:
                    tmp_param_attr = copy.deepcopy(gate_param_attr)
                    tmp_param_attr.name += "_update_gate_w"
                else:
                    tmp_param_attr = gate_param_attr
                self.ug_w = self.create_parameter(
                    attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in update_gate_weights and update_gate_weights[
"h"] is not None:
self.ug_h = update_gate_weights["h"]
else:
if gate_param_attr is not None and gate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_param_attr)
tmp_param_attr.name += "_update_gate_h"
else:
tmp_param_attr = gate_param_attr
self.ug_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in update_gate_weights and update_gate_weights[
"b"] is not None:
self.ug_b = update_gate_weights["b"]
else:
if gate_bias_attr is not None and gate_bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_bias_attr)
tmp_param_attr.name += "_update_gate_b"
else:
tmp_param_attr = gate_bias_attr
self.ug_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# reset gate parameters
if "w" in reset_gate_weights and reset_gate_weights[
"w"] is not None:
self.rg_w = reset_gate_weights["w"]
else:
if gate_param_attr is not None and gate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_param_attr)
tmp_param_attr.name += "_reset_gate_w"
else:
tmp_param_attr = gate_param_attr
self.rg_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in reset_gate_weights and reset_gate_weights[
"h"] is not None:
self.rg_h = reset_gate_weights["h"]
else:
if gate_param_attr is not None and gate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_param_attr)
tmp_param_attr.name += "_reset_gate_h"
else:
tmp_param_attr = gate_param_attr
self.rg_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in reset_gate_weights and reset_gate_weights[
"b"] is not None:
                self.rg_b = reset_gate_weights["b"]
else:
if gate_bias_attr is not None and gate_bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_bias_attr)
tmp_param_attr.name += "_reset_gate_b"
else:
tmp_param_attr = gate_bias_attr
self.rg_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# cell parameters
if "w" in cell_weights and cell_weights["w"] is not None:
self.c_w = cell_weights["w"]
else:
if candidate_param_attr is not None and candidate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(candidate_param_attr)
tmp_param_attr.name += "_cell_w"
else:
                    tmp_param_attr = candidate_param_attr
self.c_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in cell_weights and cell_weights["h"] is not None:
self.c_h = cell_weights["h"]
else:
if candidate_param_attr is not None and candidate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(candidate_param_attr)
tmp_param_attr.name += "_cell_h"
else:
                    tmp_param_attr = candidate_param_attr
self.c_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in cell_weights and cell_weights["b"] is not None:
self.c_b = cell_weights["b"]
else:
if candidate_bias_attr is not None and candidate_bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(candidate_bias_attr)
tmp_param_attr.name += "_cell_b"
else:
                    tmp_param_attr = candidate_bias_attr
self.c_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
    def forward(self, input, state):
if self.use_customized_weight:
rg_weights = layers.concat([self.rg_w, self.rg_h], axis=0)
ug_weights = layers.concat([self.ug_w, self.ug_h], axis=0)
_gate_weight = layers.concat([rg_weights, ug_weights], axis=-1)
_candidate_weight = layers.concat([self.c_w, self.c_h], axis=0)
_gate_bias = layers.concat([self.rg_b, self.ug_b], axis=0)
_candidate_bias = self.c_b
else:
_gate_weight = self._gate_weight
_gate_bias = self._gate_bias
_candidate_weight = self._candidate_weight
_candidate_bias = self._candidate_bias
        pre_hidden = state
        concat_input_hidden = layers.concat([input, pre_hidden], axis=1)

        gate_input = layers.matmul(x=concat_input_hidden, y=_gate_weight)
        gate_input = layers.elementwise_add(gate_input, _gate_bias)
        gate_input = self._gate_activation(gate_input)
        r, u = layers.split(gate_input, num_or_sections=2, dim=1)
...@@ -340,8 +756,8 @@ class BasicGRUCell(RNNCell):
        r_hidden = r * pre_hidden

        candidate = layers.matmul(
            layers.concat([input, r_hidden], 1), _candidate_weight)
        candidate = layers.elementwise_add(candidate, _candidate_bias)
        c = self._activation(candidate)
        new_hidden = u * pre_hidden + (1 - u) * c
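# Usage sketch for BasicGRUCell (sizes are illustrative): the cell maps a
# [batch, input_size] step input and a [batch, hidden_size] state to a new
# hidden state, returned both as step output and as next state:
#
#     cell = BasicGRUCell(input_size=4, hidden_size=8)
#     new_hidden, next_state = cell(step_input, pre_hidden)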
...@@ -669,7 +1085,11 @@ class PrePostProcessLayer(Layer):
    PrePostProcessLayer
    """

    def __init__(self,
process_cmd,
d_model,
dropout_rate,
reused_layer_norm=None):
        super(PrePostProcessLayer, self).__init__()
        self.process_cmd = process_cmd
        self.functors = []
...@@ -677,16 +1097,21 @@ class PrePostProcessLayer(Layer):
            if cmd == "a":  # add residual connection
                self.functors.append(lambda x, y: x + y if y else x)
            elif cmd == "n":  # add layer normalization
if reused_layer_norm is not None:
layer_norm = reused_layer_norm
else:
layer_norm = LayerNorm(
normalized_shape=d_model,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(0.)))
                self.functors.append(
                    self.add_sublayer(
                        "layer_norm_%d" % len(
                            self.sublayers(include_sublayers=False)),
                        layer_norm))
            elif cmd == "d":  # add dropout
                self.functors.append(lambda x: layers.dropout(
                    x, dropout_prob=dropout_rate, is_test=False)
...@@ -706,21 +1131,48 @@ class MultiHeadAttention(Layer):
    Multi-Head Attention
    """

    def __init__(self,
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.0,
reused_query_fc=None,
reused_key_fc=None,
reused_value_fc=None,
reused_proj_fc=None):
        super(MultiHeadAttention, self).__init__()
        self.n_head = n_head
        self.d_key = d_key
        self.d_value = d_value
        self.d_model = d_model
        self.dropout_rate = dropout_rate

        if reused_query_fc is not None:
            self.q_fc = reused_query_fc
        else:
            self.q_fc = Linear(
                input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
        if reused_key_fc is not None:
            self.k_fc = reused_key_fc
else:
self.k_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
if reused_value_fc is not None:
self.v_fc = reused_value_fc
else:
self.v_fc = Linear(
input_dim=d_model,
output_dim=d_value * n_head,
bias_attr=False)
if reused_proj_fc is not None:
self.proj_fc = reused_proj_fc
else:
self.proj_fc = Linear(
input_dim=d_value * n_head,
output_dim=d_model,
bias_attr=False)
    def _prepare_qkv(self, queries, keys, values, cache=None):
        if keys is None:  # self-attention
...@@ -797,12 +1249,24 @@ class FFN(Layer):
    Feed-Forward Network
    """

    def __init__(self,
d_inner_hid,
d_model,
dropout_rate,
fc1_act="relu",
reused_fc1=None,
reused_fc2=None):
        super(FFN, self).__init__()
        self.dropout_rate = dropout_rate
        if reused_fc1 is not None:
            self.fc1 = reused_fc1
        else:
self.fc1 = Linear(
input_dim=d_model, output_dim=d_inner_hid, act=fc1_act)
if reused_fc2 is not None:
self.fc2 = reused_fc2
else:
self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model)
    def forward(self, x):
        hidden = self.fc1(x)
...@@ -828,22 +1292,52 @@ class TransformerEncoderLayer(Layer):
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd="n",
                 postprocess_cmd="da",
ffn_fc1_act="relu",
reused_pre_selatt_layernorm=None,
reused_multihead_att_weights={
"reused_query_fc": None,
"reused_key_fc": None,
"reused_value_fc": None,
"reused_proj_fc": None
},
reused_post_selfatt_layernorm=None,
reused_pre_ffn_layernorm=None,
reused_ffn_weights={"reused_fc1": None,
"reused_fc2": None},
reused_post_ffn_layernorm=None):
        super(TransformerEncoderLayer, self).__init__()

        self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
                                                 prepostprocess_dropout,
                                                 reused_pre_selatt_layernorm)
        self.self_attn = MultiHeadAttention(
            d_key,
            d_value,
            d_model,
            n_head,
            attention_dropout,
            reused_query_fc=reused_multihead_att_weights["reused_query_fc"],
            reused_key_fc=reused_multihead_att_weights["reused_key_fc"],
            reused_value_fc=reused_multihead_att_weights["reused_value_fc"],
            reused_proj_fc=reused_multihead_att_weights["reused_proj_fc"])
        self.postprocesser1 = PrePostProcessLayer(
            postprocess_cmd, d_model, prepostprocess_dropout,
            reused_post_selfatt_layernorm)

        self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
                                                 prepostprocess_dropout,
                                                 reused_pre_ffn_layernorm)
self.ffn = FFN(d_inner_hid,
d_model,
relu_dropout,
fc1_act=ffn_fc1_act,
reused_fc1=reused_ffn_weights["reused_fc1"],
reused_fc2=reused_ffn_weights["reused_fc2"])
        self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
                                                  prepostprocess_dropout,
reused_post_ffn_layernorm)
    def forward(self, enc_input, attn_bias):
        attn_output = self.self_attn(
...@@ -871,7 +1365,8 @@ class TransformerEncoder(Layer):
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd="n",
                 postprocess_cmd="da",
ffn_fc1_act="relu"):
        super(TransformerEncoder, self).__init__()
...@@ -881,9 +1376,17 @@ class TransformerEncoder(Layer):
                self.add_sublayer(
                    "layer_%d" % i,
                    TransformerEncoderLayer(
                        n_head,
                        d_key,
                        d_value,
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
ffn_fc1_act=ffn_fc1_act)))
        self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
                                             prepostprocess_dropout)
...@@ -910,28 +1413,79 @@ class TransformerDecoderLayer(Layer):
                 attention_dropout,
                 relu_dropout,
                 preprocess_cmd="n",
                 postprocess_cmd="da",
reused_pre_selfatt_layernorm=None,
reused_self_multihead_att_weights={
"reused_query_fc": None,
"reused_key_fc": None,
"reused_value_fc": None,
"reused_proj_fc": None
},
reused_post_selfatt_layernorm=None,
reused_pre_crossatt_layernorm=None,
reused_cross_multihead_att_weights={
"reused_query_fc": None,
"reused_key_fc": None,
"reused_value_fc": None,
"reused_proj_fc": None
},
reused_post_crossatt_layernorm=None,
reused_pre_ffn_layernorm=None,
reused_ffn_weights={"reused_fc1": None,
"reused_fc2": None},
reused_post_ffn_layernorm=None):
        super(TransformerDecoderLayer, self).__init__()

        self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
                                                 prepostprocess_dropout,
                                                 reused_pre_selfatt_layernorm)
        self.self_attn = MultiHeadAttention(
            d_key,
            d_value,
d_model,
n_head,
attention_dropout,
reused_query_fc=reused_self_multihead_att_weights[
"reused_query_fc"],
reused_key_fc=reused_self_multihead_att_weights["reused_key_fc"],
reused_value_fc=reused_self_multihead_att_weights[
"reused_value_fc"],
reused_proj_fc=reused_self_multihead_att_weights["reused_proj_fc"])
self.postprocesser1 = PrePostProcessLayer(
postprocess_cmd, d_model, prepostprocess_dropout,
reused_post_selfatt_layernorm)
        self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
                                                 prepostprocess_dropout,
                                                 reused_pre_crossatt_layernorm)
        self.cross_attn = MultiHeadAttention(
            d_key,
            d_value,
d_model,
n_head,
attention_dropout,
reused_query_fc=reused_cross_multihead_att_weights[
"reused_query_fc"],
reused_key_fc=reused_cross_multihead_att_weights["reused_key_fc"],
reused_value_fc=reused_cross_multihead_att_weights[
"reused_value_fc"],
reused_proj_fc=reused_cross_multihead_att_weights[
"reused_proj_fc"])
self.postprocesser2 = PrePostProcessLayer(
postprocess_cmd, d_model, prepostprocess_dropout,
reused_post_crossatt_layernorm)
        self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model,
                                                 prepostprocess_dropout,
                                                 reused_pre_ffn_layernorm)
self.ffn = FFN(d_inner_hid,
d_model,
relu_dropout,
reused_fc1=reused_ffn_weights["reused_fc1"],
reused_fc2=reused_ffn_weights["reused_fc2"])
        self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model,
                                                  prepostprocess_dropout,
reused_post_ffn_layernorm)
    def forward(self,
                dec_input,
...@@ -998,3 +1552,304 @@ class TransformerDecoder(Layer):
                    decoder_layer.cross_attn.cal_kv(enc_output, enc_output)))
                for decoder_layer in self.decoder_layers
            ]
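# A hedged sketch of the weight sharing enabled by the reused_* hooks above:
# two encoder layers can point at one set of attention/FFN projections
# (the dimensions are arbitrary, chosen only for illustration):
def _demo_shared_encoder_layers():
    with fluid.dygraph.guard():
        base = TransformerEncoderLayer(2, 16, 16, 32, 64, 0.1, 0.1, 0.1)
        tied = TransformerEncoderLayer(
            2, 16, 16, 32, 64, 0.1, 0.1, 0.1,
            reused_multihead_att_weights={
                "reused_query_fc": base.self_attn.q_fc,
                "reused_key_fc": base.self_attn.k_fc,
                "reused_value_fc": base.self_attn.v_fc,
                "reused_proj_fc": base.self_attn.proj_fc
            },
            reused_ffn_weights={
                "reused_fc1": base.ffn.fc1,
                "reused_fc2": base.ffn.fc2
            })
        # both layers now hold the very same Linear sublayers
        assert tied.self_attn.q_fc is base.self_attn.q_fc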
#TODO: we should merge GRUCell with BasicGRUCell
class GRUCell(RNNCell):
def __init__(self,
input_size,
hidden_size,
param_attr=None,
bias_attr=None,
gate_activation='sigmoid',
candidate_activation='tanh',
origin_mode=False):
super(GRUCell, self).__init__()
self.hidden_size = hidden_size
self.fc_layer = Linear(
input_size, hidden_size * 3, param_attr=param_attr)
self.gru_unit = GRUUnit(
hidden_size * 3,
param_attr=param_attr,
bias_attr=bias_attr,
activation=candidate_activation,
gate_activation=gate_activation,
origin_mode=origin_mode)
def forward(self, inputs, states):
# for GRUCell, `step_outputs` and `new_states` both are hidden
x = self.fc_layer(inputs)
hidden, _, _ = self.gru_unit(x, states)
return hidden, hidden
@property
def state_shape(self):
return [self.hidden_size]
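# Usage sketch for GRUCell (sizes are illustrative; np and to_variable are
# already imported at the top of this file):
def _demo_gru_cell(batch_size=2, input_size=4, hidden_size=8):
    with fluid.dygraph.guard():
        cell = GRUCell(input_size, hidden_size)
        step_in = to_variable(
            np.random.rand(batch_size, input_size).astype('float32'))
        init_h = to_variable(
            np.zeros((batch_size, hidden_size), dtype='float32'))
        out, new_h = cell(step_in, init_h)
        # step output and new state are the same hidden tensor
        assert out.shape == [batch_size, hidden_size]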
#TODO: we should merge GRUCell with BasicGRUCell
class GRUEncoderCell(RNNCell):
def __init__(self,
num_layers,
input_size,
hidden_size,
dropout_prob=0.,
init_scale=0.1):
super(GRUEncoderCell, self).__init__()
self.dropout_prob = dropout_prob
# use add_sublayer to add multi-layers
self.gru_cells = []
for i in range(num_layers):
self.gru_cells.append(
self.add_sublayer(
"gru_%d" % i,
#BasicGRUCell(
GRUCell(
input_size=input_size if i == 0 else hidden_size,
hidden_size=hidden_size,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale)))))
def forward(self, step_input, states):
new_states = []
for i, gru_cell in enumerate(self.gru_cells):
out, state = gru_cell(step_input, states[i])
step_input = layers.dropout(
out,
self.dropout_prob,
dropout_implementation='upscale_in_train'
) if self.dropout_prob > 0 else out
new_states.append(step_input)
return step_input, new_states
@property
def state_shape(self):
return [cell.state_shape for cell in self.gru_cells]
class BiGRU(fluid.dygraph.Layer):
def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
super(BiGRU, self).__init__()
self.gru = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0,
init_bound),
is_reverse=False,
time_major=False)
self.gru_r = RNN(GRUEncoderCell(1, input_dim, grnn_hidden_dim, 0.0,
init_bound),
is_reverse=True,
time_major=False)
def forward(self, input_feature):
pre_gru, pre_state = self.gru(input_feature)
gru_r, r_state = self.gru_r(input_feature)
bi_merge = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1)
return bi_merge
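# BiGRU usage sketch: it consumes [batch, seq_len, dim] features and returns
# the forward and reverse GRU outputs concatenated on the last axis (the
# sizes below are illustrative):
def _demo_bigru(batch_size=2, seq_len=5, dim=4, hidden=8):
    with fluid.dygraph.guard():
        bigru = BiGRU(dim, hidden, init_bound=0.1)
        feats = to_variable(
            np.random.rand(batch_size, seq_len, dim).astype('float32'))
        merged = bigru(feats)
        assert merged.shape == [batch_size, seq_len, hidden * 2]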
class Linear_chain_crf(fluid.dygraph.Layer):
def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
super(Linear_chain_crf, self).__init__()
self._param_attr = param_attr
self._dtype = dtype
self._size = size
self._is_test = is_test
self._transition = self.create_parameter(
attr=self._param_attr,
shape=[self._size + 2, self._size],
dtype=self._dtype)
@property
def weight(self):
return self._transition
@weight.setter
def weight(self, value):
self._transition = value
def forward(self, input, label, length=None):
alpha = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
emission_exps = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
transition_exps = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
log_likelihood = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
this_inputs = {
"Emission": [input],
"Transition": self._transition,
"Label": [label]
}
if length is not None:
this_inputs['Length'] = [length]
self._helper.append_op(
type='linear_chain_crf',
inputs=this_inputs,
outputs={
"Alpha": [alpha],
"EmissionExps": [emission_exps],
"TransitionExps": transition_exps,
"LogLikelihood": log_likelihood
},
attrs={"is_test": self._is_test, })
return log_likelihood
class Crf_decoding(fluid.dygraph.Layer):
def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
super(Crf_decoding, self).__init__()
self._dtype = dtype
self._size = size
self._is_test = is_test
self._param_attr = param_attr
self._transition = self.create_parameter(
attr=self._param_attr,
shape=[self._size + 2, self._size],
dtype=self._dtype)
@property
def weight(self):
return self._transition
@weight.setter
def weight(self, value):
self._transition = value
def forward(self, input, label=None, length=None):
viterbi_path = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
this_inputs = {
"Emission": [input],
"Transition": self._transition,
"Label": label
}
if length is not None:
this_inputs['Length'] = [length]
self._helper.append_op(
type='crf_decoding',
inputs=this_inputs,
outputs={"ViterbiPath": [viterbi_path]},
attrs={"is_test": self._is_test, })
return viterbi_path
class SequenceTagging(fluid.dygraph.Layer):
def __init__(self,
vocab_size,
num_labels,
batch_size,
word_emb_dim=128,
grnn_hidden_dim=128,
emb_learning_rate=0.1,
crf_learning_rate=0.1,
bigru_num=2,
init_bound=0.1,
length=None):
super(SequenceTagging, self).__init__()
"""
define the sequence tagging network structure
word: stores the input of the model
for_infer: a boolean value, indicating if the model to be created is for training or predicting.
return:
for infer: return the prediction
otherwise: return the prediction
"""
self.word_emb_dim = word_emb_dim
self.vocab_size = vocab_size
self.num_labels = num_labels
self.grnn_hidden_dim = grnn_hidden_dim
self.emb_lr = emb_learning_rate
self.crf_lr = crf_learning_rate
self.bigru_num = bigru_num
self.batch_size = batch_size
        self.init_bound = init_bound
self.word_embedding = Embedding(
size=[self.vocab_size, self.word_emb_dim],
dtype='float32',
param_attr=fluid.ParamAttr(
learning_rate=self.emb_lr,
name="word_emb",
initializer=fluid.initializer.Uniform(
low=-self.init_bound, high=self.init_bound)))
h_0 = fluid.layers.create_global_var(
shape=[self.batch_size, self.grnn_hidden_dim],
value=0.0,
dtype='float32',
persistable=True,
force_cpu=True,
name='h_0')
self.bigru_units = []
for i in range(self.bigru_num):
if i == 0:
self.bigru_units.append(
self.add_sublayer(
"bigru_units%d" % i,
BiGRU(
self.grnn_hidden_dim,
self.grnn_hidden_dim,
self.init_bound,
h_0=h_0)))
else:
self.bigru_units.append(
self.add_sublayer(
"bigru_units%d" % i,
BiGRU(
self.grnn_hidden_dim * 2,
self.grnn_hidden_dim,
self.init_bound,
h_0=h_0)))
self.fc = Linear(
input_dim=self.grnn_hidden_dim * 2,
output_dim=self.num_labels,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-self.init_bound, high=self.init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.linear_chain_crf = Linear_chain_crf(
param_attr=fluid.ParamAttr(
name='linear_chain_crfw', learning_rate=self.crf_lr),
size=self.num_labels)
self.crf_decoding = Crf_decoding(
param_attr=fluid.ParamAttr(
name='crfw', learning_rate=self.crf_lr),
size=self.num_labels)
def forward(self, word, lengths, target=None):
"""
Configure the network
"""
word_embed = self.word_embedding(word)
input_feature = word_embed
for i in range(self.bigru_num):
bigru_output = self.bigru_units[i](input_feature)
input_feature = bigru_output
emission = self.fc(bigru_output)
if target is not None:
crf_cost = self.linear_chain_crf(
input=emission, label=target, length=lengths)
avg_cost = fluid.layers.mean(x=crf_cost)
self.crf_decoding.weight = self.linear_chain_crf.weight
crf_decode = self.crf_decoding(input=emission, length=lengths)
return crf_decode, avg_cost, lengths
else:
self.linear_chain_crf.weight = self.crf_decoding.weight
crf_decode = self.crf_decoding(input=emission, length=lengths)
return crf_decode, lengths
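# End-to-end sketch for SequenceTagging in dygraph mode; the vocabulary
# size, label count and the random batch are made up for illustration:
def _demo_sequence_tagging(vocab_size=100, num_labels=10, batch_size=2,
                           seq_len=6):
    with fluid.dygraph.guard():
        model = SequenceTagging(vocab_size, num_labels, batch_size)
        words = to_variable(np.random.randint(
            0, vocab_size, (batch_size, seq_len)).astype('int64'))
        lengths = to_variable(np.array([seq_len, seq_len - 2], dtype='int64'))
        targets = to_variable(np.random.randint(
            0, num_labels, (batch_size, seq_len)).astype('int64'))
        # with targets: CRF decoding result, average CRF cost and lengths
        crf_decode, avg_cost, _ = model(words, lengths, targets)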
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import unicodedata
import six
import io
def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
fin = io.open(vocab_file, encoding="utf8")
for num, line in enumerate(fin):
items = convert_to_unicode(line.strip()).split("\t")
if len(items) > 2:
break
token = items[0]
index = items[1] if len(items) == 2 else num
token = token.strip()
vocab[token] = int(index)
return vocab
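# load_vocab accepts either one token per line or "token<TAB>id" lines; a
# runnable sketch with a throwaway file (the path is illustrative):
def _demo_load_vocab(tmp_path="/tmp/demo_vocab.txt"):
    with io.open(tmp_path, "w", encoding="utf8") as fout:
        fout.write(u"[PAD]\n[UNK]\nhello\nworld\n")
    vocab = load_vocab(tmp_path)
    assert vocab[u"hello"] == 2 and vocab[u"world"] == 3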
def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output
def convert_tokens_to_ids(vocab, tokens):
return convert_by_vocab(vocab, tokens)
def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
class FullTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
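# Hedged usage sketch for FullTokenizer with a toy WordPiece vocab written
# to a throwaway path (both the path and vocab entries are illustrative):
def _demo_full_tokenizer(tmp_vocab="/tmp/demo_wordpiece_vocab.txt"):
    with io.open(tmp_vocab, "w", encoding="utf8") as fout:
        fout.write(u"[UNK]\nun\n##able\n")
    tokenizer = FullTokenizer(vocab_file=tmp_vocab, do_lower_case=True)
    tokens = tokenizer.tokenize(u"Unable")  # -> ["un", "##able"]
    return tokenizer.convert_tokens_to_ids(tokens)  # -> [1, 2]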
class CharTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in text.lower().split(" "):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case
def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
        # like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
class WordpieceTokenizer(object):
"""Runs WordPiece tokenziation."""
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
text = convert_to_unicode(text)
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
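# A minimal sketch (not part of the original module) demonstrating the greedy
# longest-match-first loop above on a toy vocabulary:
def _wordpiece_demo():
    vocab = {"un": 0, "##aff": 1, "##able": 2, "[UNK]": 3}
    tokenizer = WordpieceTokenizer(vocab=vocab)
    # "unaffable" is consumed as the longest matching pieces: un / ##aff / ##able
    assert tokenizer.tokenize("unaffable") == ["un", "##aff", "##able"]
    # a word with no matching piece at some position falls back to [UNK]
    assert tokenizer.tokenize("xyz") == ["[UNK]"]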
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import models
from . import transforms
__all__ = ["models", "transforms"]
@@ -17,21 +17,15 @@ from . import vgg
 from . import mobilenetv1
 from . import mobilenetv2
 from . import darknet
-from . import yolov3
-from . import tsm
 from .resnet import *
 from .mobilenetv1 import *
 from .mobilenetv2 import *
 from .vgg import *
 from .darknet import *
-from .yolov3 import *
-from .tsm import *
 __all__ = resnet.__all__ \
     + vgg.__all__ \
     + mobilenetv1.__all__ \
     + mobilenetv2.__all__ \
-    + darknet.__all__ \
-    + yolov3.__all__ \
-    + tsm.__all__
+    + darknet.__all__
@@ -18,10 +18,10 @@ from paddle.fluid.regularizer import L2Decay
 from paddle.fluid.dygraph.nn import Conv2D, BatchNorm
-from model import Model
-from .download import get_weights_path
+from hapi.model import Model
+from hapi.download import get_weights_path
-__all__ = ['DarkNet53', 'ConvBNLayer', 'darknet53']
+__all__ = ['DarkNet', 'darknet53']
 # {num_layers: (url, md5)}
 pretrain_infos = {
@@ -136,9 +136,17 @@ class LayerWarp(fluid.dygraph.Layer):
 DarkNet_cfg = {53: ([1, 2, 8, 8, 4])}
-class DarkNet53(Model):
+class DarkNet(Model):
+    """DarkNet model from
+    `"YOLOv3: An Incremental Improvement" <https://arxiv.org/abs/1804.02767>`_
+    Args:
+        num_layers (int): layer number of DarkNet, only 53 supported currently, default: 53.
+        ch_in (int): channel number of input data, default 3.
+    """
     def __init__(self, num_layers=53, ch_in=3):
-        super(DarkNet53, self).__init__()
+        super(DarkNet, self).__init__()
         assert num_layers in DarkNet_cfg.keys(), \
             "only support num_layers in {} currently" \
             .format(DarkNet_cfg.keys())
@@ -188,7 +196,7 @@ class DarkNet53(Model):
 def _darknet(num_layers=53, input_channels=3, pretrained=True):
-    model = DarkNet53(num_layers, input_channels)
+    model = DarkNet(num_layers, input_channels)
     if pretrained:
         assert num_layers in pretrain_infos.keys(), \
             "DarkNet{} does not have pretrained weights now, " \
@@ -201,4 +209,11 @@ def _darknet(num_layers=53, input_channels=3, pretrained=True):
 def darknet53(input_channels=3, pretrained=True):
+    """DarkNet 53-layer model
+    Args:
+        input_channels (int): channel number of input data, default 3.
+        pretrained (bool): If True, returns a model pre-trained on ImageNet,
+            default True.
+    """
     return _darknet(53, input_channels, pretrained)
@@ -19,8 +19,8 @@ from paddle.fluid.initializer import MSRA
 from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
-from model import Model
-from .download import get_weights_path
+from hapi.model import Model
+from hapi.download import get_weights_path
 __all__ = ['MobileNetV1', 'mobilenet_v1']
@@ -111,13 +111,22 @@ class MobileNetV1(Model):
     Args:
         scale (float): scale of channels in each layer. Default: 1.0.
-        class_dim (int): output dim of last fc layer. Default: 1000.
+        num_classes (int): output dim of last fc layer. If num_classes <= 0, the last
+            fc layer will not be defined. Default: 1000.
+        with_pool (bool): use pool before the last fc layer or not. Default: True.
+        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
     """
-    def __init__(self, scale=1.0, class_dim=1000):
+    def __init__(self,
+                 scale=1.0,
+                 num_classes=1000,
+                 with_pool=True,
+                 classifier_activation='softmax'):
         super(MobileNetV1, self).__init__()
         self.scale = scale
         self.dwsl = []
+        self.num_classes = num_classes
+        self.with_pool = with_pool
         self.conv1 = ConvBNLayer(
             num_channels=3,
@@ -227,23 +236,29 @@ class MobileNetV1(Model):
             name="conv6")
         self.dwsl.append(dws6)
-        self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
+        if with_pool:
+            self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
-        self.out = Linear(
-            int(1024 * scale),
-            class_dim,
-            act='softmax',
-            param_attr=ParamAttr(
-                initializer=MSRA(), name=self.full_name() + "fc7_weights"),
-            bias_attr=ParamAttr(name="fc7_offset"))
+        if num_classes > 0:
+            self.out = Linear(
+                int(1024 * scale),
+                num_classes,
+                act=classifier_activation,
+                param_attr=ParamAttr(
+                    initializer=MSRA(), name=self.full_name() + "fc7_weights"),
+                bias_attr=ParamAttr(name="fc7_offset"))
     def forward(self, inputs):
         y = self.conv1(inputs)
         for dws in self.dwsl:
             y = dws(y)
-        y = self.pool2d_avg(y)
-        y = fluid.layers.reshape(y, shape=[-1, 1024])
-        y = self.out(y)
+        if self.with_pool:
+            y = self.pool2d_avg(y)
+        if self.num_classes > 0:
+            y = fluid.layers.reshape(y, shape=[-1, 1024])
+            y = self.out(y)
         return y
@@ -261,6 +276,13 @@ def _mobilenet(arch, pretrained=False, **kwargs):
     return model
-def mobilenet_v1(pretrained=False, scale=1.0):
-    model = _mobilenet('mobilenetv1_' + str(scale), pretrained, scale=scale)
+def mobilenet_v1(pretrained=False, scale=1.0, **kwargs):
+    """MobileNetV1
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        scale (float): scale of channels in each layer. Default: 1.0.
+    """
+    model = _mobilenet(
+        'mobilenetv1_' + str(scale), pretrained, scale=scale, **kwargs)
     return model
@@ -18,8 +18,8 @@ import paddle.fluid as fluid
 from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
-from model import Model
-from .download import get_weights_path
+from hapi.model import Model
+from hapi.download import get_weights_path
 __all__ = ['MobileNetV2', 'mobilenet_v2']
@@ -156,13 +156,21 @@ class MobileNetV2(Model):
     Args:
         scale (float): scale of channels in each layer. Default: 1.0.
-        class_dim (int): output dim of last fc layer. Default: 1000.
+        num_classes (int): output dim of last fc layer. If num_classes <= 0, the last
+            fc layer will not be defined. Default: 1000.
+        with_pool (bool): use pool before the last fc layer or not. Default: True.
+        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
     """
-    def __init__(self, scale=1.0, class_dim=1000):
+    def __init__(self,
+                 scale=1.0,
+                 num_classes=1000,
+                 with_pool=True,
+                 classifier_activation='softmax'):
         super(MobileNetV2, self).__init__()
         self.scale = scale
-        self.class_dim = class_dim
+        self.num_classes = num_classes
+        self.with_pool = with_pool
         bottleneck_params_list = [
             (1, 16, 1, 1),
@@ -174,7 +182,6 @@ class MobileNetV2(Model):
             (6, 320, 1, 1),
         ]
-        #1. conv1
         self._conv1 = ConvBNLayer(
             num_channels=3,
             num_filters=int(32 * scale),
@@ -182,7 +189,6 @@ class MobileNetV2(Model):
             stride=2,
             padding=1)
-        #2. bottleneck sequences
         self._invl = []
         i = 1
         in_c = int(32 * scale)
@@ -196,7 +202,6 @@ class MobileNetV2(Model):
             self._invl.append(tmp)
             in_c = int(c * scale)
-        #3. last_conv
         self._out_c = int(1280 * scale) if scale > 1.0 else 1280
         self._conv9 = ConvBNLayer(
             num_channels=in_c,
@@ -205,26 +210,29 @@ class MobileNetV2(Model):
             stride=1,
             padding=0)
-        #4. pool
-        self._pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
+        if with_pool:
+            self._pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
-        #5. fc
-        tmp_param = ParamAttr(name=self.full_name() + "fc10_weights")
-        self._fc = Linear(
-            self._out_c,
-            class_dim,
-            act='softmax',
-            param_attr=tmp_param,
-            bias_attr=ParamAttr(name="fc10_offset"))
+        if num_classes > 0:
+            tmp_param = ParamAttr(name=self.full_name() + "fc10_weights")
+            self._fc = Linear(
+                self._out_c,
+                num_classes,
+                act=classifier_activation,
+                param_attr=tmp_param,
+                bias_attr=ParamAttr(name="fc10_offset"))
     def forward(self, inputs):
         y = self._conv1(inputs, if_act=True)
         for inv in self._invl:
             y = inv(y)
         y = self._conv9(y, if_act=True)
-        y = self._pool2d_avg(y)
-        y = fluid.layers.reshape(y, shape=[-1, self._out_c])
-        y = self._fc(y)
+        if self.with_pool:
+            y = self._pool2d_avg(y)
+        if self.num_classes > 0:
+            y = fluid.layers.reshape(y, shape=[-1, self._out_c])
+            y = self._fc(y)
         return y
@@ -242,11 +250,13 @@ def _mobilenet(arch, pretrained=False, **kwargs):
     return model
-def mobilenet_v2(pretrained=False, scale=1.0):
+def mobilenet_v2(pretrained=False, scale=1.0, **kwargs):
     """MobileNetV2
     Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        scale (float): scale of channels in each layer. Default: 1.0.
     """
-    model = _mobilenet('mobilenetv2_' + str(scale), pretrained, scale=scale)
+    model = _mobilenet(
+        'mobilenetv2_' + str(scale), pretrained, scale=scale, **kwargs)
     return model
@@ -22,16 +22,26 @@ from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 from paddle.fluid.dygraph.container import Sequential
-from model import Model
-from .download import get_weights_path
+from hapi.model import Model
+from hapi.download import get_weights_path
 __all__ = [
     'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'
 ]
 model_urls = {
+    'resnet18': ('https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams',
+                 '0ba53eea9bc970962d0ef96f7b94057e'),
+    'resnet34': ('https://paddle-hapi.bj.bcebos.com/models/resnet34.pdparams',
+                 '46bc9f7c3dd2e55b7866285bee91eff3'),
     'resnet50': ('https://paddle-hapi.bj.bcebos.com/models/resnet50.pdparams',
-                 '0884c9087266496c41c60d14a96f8530')
+                 '0884c9087266496c41c60d14a96f8530'),
+    'resnet101':
+    ('https://paddle-hapi.bj.bcebos.com/models/resnet101.pdparams',
+     'fb07a451df331e4b0bb861ed97c3a9b9'),
+    'resnet152':
+    ('https://paddle-hapi.bj.bcebos.com/models/resnet152.pdparams',
+     'f9c700f26d3644bb76ad2226ed5f5713'),
 }
@@ -163,12 +173,23 @@ class ResNet(Model):
     Args:
         Block (BasicBlock|BottleneckBlock): block module of model.
         depth (int): layers of resnet, default: 50.
-        num_classes (int): output dim of last fc layer, default: 1000.
+        num_classes (int): output dim of last fc layer. If num_classes <= 0, the last
+            fc layer will not be defined. Default: 1000.
+        with_pool (bool): use pool before the last fc layer or not. Default: True.
+        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
     """
-    def __init__(self, Block, depth=50, num_classes=1000):
+    def __init__(self,
+                 Block,
+                 depth=50,
+                 num_classes=1000,
+                 with_pool=True,
+                 classifier_activation='softmax'):
         super(ResNet, self).__init__()
+        self.num_classes = num_classes
+        self.with_pool = with_pool
         layer_config = {
             18: [2, 2, 2, 2],
             34: [3, 4, 6, 3],
@@ -212,31 +233,37 @@ class ResNet(Model):
                 Sequential(*blocks))
             self.layers.append(layer)
-        self.global_pool = Pool2D(
-            pool_size=7, pool_type='avg', global_pooling=True)
+        if with_pool:
+            self.global_pool = Pool2D(
+                pool_size=7, pool_type='avg', global_pooling=True)
-        stdv = 1.0 / math.sqrt(out_channels[-1] * Block.expansion * 1.0)
-        self.fc_input_dim = out_channels[-1] * Block.expansion * 1 * 1
-        self.fc = Linear(
-            self.fc_input_dim,
-            num_classes,
-            act='softmax',
-            param_attr=fluid.param_attr.ParamAttr(
-                initializer=fluid.initializer.Uniform(-stdv, stdv)))
+        if num_classes > 0:
+            stdv = 1.0 / math.sqrt(out_channels[-1] * Block.expansion * 1.0)
+            self.fc_input_dim = out_channels[-1] * Block.expansion * 1 * 1
+            self.fc = Linear(
+                self.fc_input_dim,
+                num_classes,
+                act=classifier_activation,
+                param_attr=fluid.param_attr.ParamAttr(
+                    initializer=fluid.initializer.Uniform(-stdv, stdv)))
     def forward(self, inputs):
         x = self.conv(inputs)
         x = self.pool(x)
         for layer in self.layers:
             x = layer(x)
-        x = self.global_pool(x)
-        x = fluid.layers.reshape(x, shape=[-1, self.fc_input_dim])
-        x = self.fc(x)
+        if self.with_pool:
+            x = self.global_pool(x)
+        if self.num_classes > 0:
+            x = fluid.layers.reshape(x, shape=[-1, self.fc_input_dim])
+            x = self.fc(x)
         return x
-def _resnet(arch, Block, depth, pretrained):
-    model = ResNet(Block, depth)
+def _resnet(arch, Block, depth, pretrained, **kwargs):
+    model = ResNet(Block, depth, **kwargs)
     if pretrained:
         assert arch in model_urls, "{} model does not have a pretrained model now, you should set pretrained=False".format(
             arch)
@@ -248,46 +275,46 @@ def _resnet(arch, Block, depth, pretrained):
     return model
-def resnet18(pretrained=False):
+def resnet18(pretrained=False, **kwargs):
     """ResNet 18-layer model
     Args:
         pretrained (bool): If True, returns a model pre-trained on ImageNet
     """
-    return _resnet('resnet18', BasicBlock, 18, pretrained)
+    return _resnet('resnet18', BasicBlock, 18, pretrained, **kwargs)
-def resnet34(pretrained=False):
+def resnet34(pretrained=False, **kwargs):
     """ResNet 34-layer model
     Args:
         pretrained (bool): If True, returns a model pre-trained on ImageNet
     """
-    return _resnet('resnet34', BasicBlock, 34, pretrained)
+    return _resnet('resnet34', BasicBlock, 34, pretrained, **kwargs)
-def resnet50(pretrained=False):
+def resnet50(pretrained=False, **kwargs):
     """ResNet 50-layer model
     Args:
         pretrained (bool): If True, returns a model pre-trained on ImageNet
     """
-    return _resnet('resnet50', BottleneckBlock, 50, pretrained)
+    return _resnet('resnet50', BottleneckBlock, 50, pretrained, **kwargs)
-def resnet101(pretrained=False):
+def resnet101(pretrained=False, **kwargs):
     """ResNet 101-layer model
     Args:
         pretrained (bool): If True, returns a model pre-trained on ImageNet
     """
-    return _resnet('resnet101', BottleneckBlock, 101, pretrained)
+    return _resnet('resnet101', BottleneckBlock, 101, pretrained, **kwargs)
-def resnet152(pretrained=False):
+def resnet152(pretrained=False, **kwargs):
     """ResNet 152-layer model
     Args:
         pretrained (bool): If True, returns a model pre-trained on ImageNet
     """
-    return _resnet('resnet152', BottleneckBlock, 152, pretrained)
+    return _resnet('resnet152', BottleneckBlock, 152, pretrained, **kwargs)
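# Usage sketch for the refactored API above (illustrative; assumes the package
# layout hapi.vision.models and that pretrained weights are not needed):
#
#     from hapi.vision.models import resnet50
#     # **kwargs now reach ResNet, so a headless backbone can be built:
#     backbone = resnet50(pretrained=False, num_classes=-1, with_pool=False)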
@@ -17,18 +17,14 @@ import paddle.fluid as fluid
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 from paddle.fluid.dygraph.container import Sequential
-from model import Model
-from .download import get_weights_path
+from hapi.model import Model
+from hapi.download import get_weights_path
 __all__ = [
     'VGG',
     'vgg11',
-    'vgg11_bn',
     'vgg13',
-    'vgg13_bn',
     'vgg16',
-    'vgg16_bn',
-    'vgg19_bn',
     'vgg19',
 ]
@@ -39,11 +35,11 @@ model_urls = {
 class Classifier(fluid.dygraph.Layer):
-    def __init__(self, num_classes):
+    def __init__(self, num_classes, classifier_activation='softmax'):
         super(Classifier, self).__init__()
         self.linear1 = Linear(512 * 7 * 7, 4096)
         self.linear2 = Linear(4096, 4096)
-        self.linear3 = Linear(4096, num_classes, act='softmax')
+        self.linear3 = Linear(4096, num_classes, act=classifier_activation)
     def forward(self, x):
         x = self.linear1(x)
@@ -62,20 +58,30 @@ class VGG(Model):
     Args:
         features (fluid.dygraph.Layer): vgg features created by function make_layers.
-        num_classes (int): output dim of last fc layer. Default: 1000.
+        num_classes (int): output dim of last fc layer. If num_classes <= 0, the last
+            fc layer will not be defined. Default: 1000.
+        classifier_activation (str): activation for the last fc layer. Default: 'softmax'.
     """
-    def __init__(self, features, num_classes=1000):
+    def __init__(self,
+                 features,
+                 num_classes=1000,
+                 classifier_activation='softmax'):
         super(VGG, self).__init__()
         self.features = features
-        classifier = Classifier(num_classes)
-        self.classifier = self.add_sublayer("classifier",
-                                            Sequential(classifier))
+        self.num_classes = num_classes
+        if num_classes > 0:
+            classifier = Classifier(num_classes, classifier_activation)
+            self.classifier = self.add_sublayer("classifier",
+                                                Sequential(classifier))
     def forward(self, x):
         x = self.features(x)
-        x = fluid.layers.flatten(x, 1)
-        x = self.classifier(x)
+        if self.num_classes > 0:
+            x = fluid.layers.flatten(x, 1)
+            x = self.classifier(x)
         return x
@@ -114,7 +120,10 @@ cfgs = {
 def _vgg(arch, cfg, batch_norm, pretrained, **kwargs):
-    model = VGG(make_layers(cfgs[cfg], batch_norm=batch_norm), **kwargs)
+    model = VGG(make_layers(
+        cfgs[cfg], batch_norm=batch_norm),
+                num_classes=1000,
+                **kwargs)
     if pretrained:
         assert arch in model_urls, "{} model does not have a pretrained model now, you should set pretrained=False".format(
@@ -128,73 +137,53 @@ def _vgg(arch, cfg, batch_norm, pretrained, **kwargs):
     return model
-def vgg11(pretrained=False, **kwargs):
+def vgg11(pretrained=False, batch_norm=False, **kwargs):
     """VGG 11-layer model
     Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        batch_norm (bool): If True, returns a model with batch_norm layer. Default: False.
     """
-    return _vgg('vgg11', 'A', False, pretrained, **kwargs)
-def vgg11_bn(pretrained=False, **kwargs):
-    """VGG 11-layer model with batch normalization
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
-    """
-    return _vgg('vgg11_bn', 'A', True, pretrained, **kwargs)
+    model_name = 'vgg11'
+    if batch_norm:
+        model_name += '_bn'
+    return _vgg(model_name, 'A', batch_norm, pretrained, **kwargs)
-def vgg13(pretrained=False, **kwargs):
+def vgg13(pretrained=False, batch_norm=False, **kwargs):
     """VGG 13-layer model
     Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        batch_norm (bool): If True, returns a model with batch_norm layer. Default: False.
     """
-    return _vgg('vgg13', 'B', False, pretrained, **kwargs)
-def vgg13_bn(pretrained=False, **kwargs):
-    """VGG 13-layer model with batch normalization
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
-    """
-    return _vgg('vgg13_bn', 'B', True, pretrained, **kwargs)
+    model_name = 'vgg13'
+    if batch_norm:
+        model_name += '_bn'
+    return _vgg(model_name, 'B', batch_norm, pretrained, **kwargs)
-def vgg16(pretrained=False, **kwargs):
+def vgg16(pretrained=False, batch_norm=False, **kwargs):
     """VGG 16-layer model
     Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        batch_norm (bool): If True, returns a model with batch_norm layer. Default: False.
     """
-    return _vgg('vgg16', 'D', False, pretrained, **kwargs)
-def vgg16_bn(pretrained=False, **kwargs):
-    """VGG 16-layer model with batch normalization
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
-    """
-    return _vgg('vgg16_bn', 'D', True, pretrained, **kwargs)
+    model_name = 'vgg16'
+    if batch_norm:
+        model_name += '_bn'
+    return _vgg(model_name, 'D', batch_norm, pretrained, **kwargs)
-def vgg19(pretrained=False, **kwargs):
+def vgg19(pretrained=False, batch_norm=False, **kwargs):
     """VGG 19-layer model
     Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
+        pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False.
+        batch_norm (bool): If True, returns a model with batch_norm layer. Default: False.
     """
-    return _vgg('vgg19', 'E', False, pretrained, **kwargs)
-def vgg19_bn(pretrained=False, **kwargs):
-    """VGG 19-layer model with batch normalization
-    Args:
-        pretrained (bool): If True, returns a model pre-trained on ImageNet
-    """
-    return _vgg('vgg19_bn', 'E', True, pretrained, **kwargs)
+    model_name = 'vgg19'
+    if batch_norm:
+        model_name += '_bn'
+    return _vgg(model_name, 'E', batch_norm, pretrained, **kwargs)
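# The separate vgg*_bn entry points above are folded into a batch_norm flag
# (illustrative sketch, same package-layout assumption as above):
#
#     from hapi.vision.models import vgg16
#     model = vgg16(pretrained=False, batch_norm=True)  # formerly vgg16_bn()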
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import transforms
from . import functional
from .transforms import *
from .functional import *
__all__ = transforms.__all__ \
+ functional.__all__
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import collections
import random
import cv2
import numpy as np
if sys.version_info < (3, 3):
Sequence = collections.Sequence
Iterable = collections.Iterable
else:
Sequence = collections.abc.Sequence
Iterable = collections.abc.Iterable
__all__ = ['flip', 'resize']
def flip(image, code):
"""
    Flip the input image according to `code` (the type of flip).
    Args:
        image: Input image, with (H, W, C) shape
        code: Code that indicates the type of flip:
            -1 : flip horizontally and vertically
             0 : flip vertically
             1 : flip horizontally
"""
return cv2.flip(image, flipCode=code)
def resize(img, size, interpolation=cv2.INTER_LINEAR):
"""
    Resize the input data to the given size.
    Args:
        img: Input data, could be an image or masks, with (H, W, C) shape
        size: Target size of input data, with (height, width) shape.
        interpolation: Interpolation method.
"""
if isinstance(interpolation, Sequence):
interpolation = random.choice(interpolation)
if isinstance(size, int):
h, w = img.shape[:2]
if (w <= h and w == size) or (h <= w and h == size):
return img
if w < h:
ow = size
oh = int(size * h / w)
return cv2.resize(img, (ow, oh), interpolation=interpolation)
else:
oh = size
ow = int(size * w / h)
return cv2.resize(img, (ow, oh), interpolation=interpolation)
else:
return cv2.resize(img, size[::-1], interpolation=interpolation)
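# A small sketch (not part of the original module) of the two resize modes:
# an int size rescales the smaller edge and keeps the aspect ratio, while a
# (height, width) pair resizes to exactly that shape.
def _resize_demo():
    img = np.zeros((300, 400, 3), dtype=np.uint8)
    assert resize(img, 200).shape[:2] == (200, 266)       # smaller edge -> 200
    assert resize(img, (100, 50)).shape[:2] == (100, 50)  # exact (h, w)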
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import math
import sys
import random
import cv2
import numpy as np
import numbers
import types
import collections
import warnings
import traceback
from . import functional as F
if sys.version_info < (3, 3):
Iterable = collections.Iterable
else:
Iterable = collections.abc.Iterable
__all__ = [
"Compose",
"BatchCompose",
"Resize",
"RandomResizedCrop",
"CenterCropResize",
"CenterCrop",
"RandomHorizontalFlip",
"RandomVerticalFlip",
"Permute",
"Normalize",
"GaussianNoise",
"BrightnessTransform",
"SaturationTransform",
"ContrastTransform",
"HueTransform",
"ColorJitter",
]
class Compose(object):
"""Composes several transforms together.
Args:
transforms (list of ``Transform`` objects): list of transforms to compose.
"""
def __init__(self, transforms):
self.transforms = transforms
def __call__(self, *data):
for f in self.transforms:
try:
data = f(*data)
except Exception as e:
stack_info = traceback.format_exc()
print("fail to perform transform [{}] with error: "
"{} and stack:\n{}".format(f, e, str(stack_info)))
raise e
return data
def __repr__(self):
format_string = self.__class__.__name__ + '('
for t in self.transforms:
format_string += '\n'
format_string += ' {0}'.format(t)
format_string += '\n)'
return format_string
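# Usage sketch (illustrative): every transform in this module consumes and
# returns an (img, lbl) pair, so Compose threads both through the pipeline:
#
#     transform = Compose([Resize(256), CenterCrop(224), Normalize(127.5, 127.5)])
#     img_out, lbl_out = transform(img, lbl)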
class BatchCompose(object):
"""Composes several batch transforms together
Args:
transforms (list of ``Transform`` objects): list of transforms to compose.
these transforms perform on batch data.
"""
def __init__(self, transforms=[]):
self.transforms = transforms
def __call__(self, data):
for f in self.transforms:
try:
data = f(data)
except Exception as e:
stack_info = traceback.format_exc()
print("fail to perform batch transform [{}] with error: "
"{} and stack:\n{}".format(f, e, str(stack_info)))
raise e
# sample list to batch data
batch = list(zip(*data))
return batch
class Resize(object):
"""Resize the input Image to the given size.
Args:
size (int|list|tuple): Desired output size. If size is a sequence like
(h, w), output size will be matched to this. If size is an int,
smaller edge of the image will be matched to this number.
i.e, if height > width, then image will be rescaled to
(size * height / width, size)
interpolation (int): interpolation mode of resize. Default: cv2.INTER_LINEAR.
"""
def __init__(self, size, interpolation=cv2.INTER_LINEAR):
assert isinstance(size, int) or (isinstance(size, Iterable) and
len(size) == 2)
self.size = size
self.interpolation = interpolation
def __call__(self, img, lbl):
return F.resize(img, self.size, self.interpolation), lbl
class RandomResizedCrop(object):
"""Crop the input data to random size and aspect ratio.
    A crop of random size (default: 0.08 to 1.0 of the original size) and random
    aspect ratio (default: 3/4 to 4/3 of the original aspect ratio) is made.
    After the crop, the input data is resized to the given size.
    Args:
        output_size (int|list|tuple): Target size of output image, with (height, width) shape.
        scale (list|tuple): Range of size of the origin size cropped. Default: (0.08, 1.0)
        ratio (list|tuple): Range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33)
        interpolation (int): interpolation mode of resize. Default: cv2.INTER_LINEAR.
"""
def __init__(self,
output_size,
scale=(0.08, 1.0),
ratio=(3. / 4, 4. / 3),
interpolation=cv2.INTER_LINEAR):
if isinstance(output_size, int):
self.output_size = (output_size, output_size)
else:
self.output_size = output_size
assert (scale[0] <= scale[1]), "scale should be of kind (min, max)"
assert (ratio[0] <= ratio[1]), "ratio should be of kind (min, max)"
self.scale = scale
self.ratio = ratio
self.interpolation = interpolation
def _get_params(self, image, attempts=10):
height, width, _ = image.shape
area = height * width
for _ in range(attempts):
target_area = np.random.uniform(*self.scale) * area
log_ratio = tuple(math.log(x) for x in self.ratio)
aspect_ratio = math.exp(np.random.uniform(*log_ratio))
w = int(round(math.sqrt(target_area * aspect_ratio)))
h = int(round(math.sqrt(target_area / aspect_ratio)))
if 0 < w <= width and 0 < h <= height:
x = np.random.randint(0, width - w + 1)
y = np.random.randint(0, height - h + 1)
return x, y, w, h
# Fallback to central crop
in_ratio = float(width) / float(height)
if in_ratio < min(self.ratio):
w = width
h = int(round(w / min(self.ratio)))
elif in_ratio > max(self.ratio):
h = height
w = int(round(h * max(self.ratio)))
else: # whole image
w = width
h = height
x = (width - w) // 2
y = (height - h) // 2
return x, y, w, h
def __call__(self, img, lbl):
x, y, w, h = self._get_params(img)
cropped_img = img[y:y + h, x:x + w]
return F.resize(cropped_img, self.output_size, self.interpolation), lbl
class CenterCropResize(object):
"""Crops to center of image with padding then scales size.
Args:
size (int|list|tuple): Target size of output image, with (height, width) shape.
crop_padding (int): center crop with the padding. Default: 32.
interpolation (int): interpolation mode of resize. Default: cv2.INTER_LINEAR.
"""
def __init__(self, size, crop_padding=32, interpolation=cv2.INTER_LINEAR):
if isinstance(size, int):
self.size = (size, size)
else:
self.size = size
self.crop_padding = crop_padding
self.interpolation = interpolation
def _get_params(self, img):
h, w = img.shape[:2]
size = min(self.size)
c = int(size / (size + self.crop_padding) * min((h, w)))
x = (h + 1 - c) // 2
y = (w + 1 - c) // 2
return c, x, y
def __call__(self, img, lbl):
c, x, y = self._get_params(img)
cropped_img = img[x:x + c, y:y + c, :]
return F.resize(cropped_img, self.size, self.interpolation), lbl
class CenterCrop(object):
"""Crops the given the input data at the center.
Args:
output_size: Target size of output image, with (height, width) shape.
"""
def __init__(self, output_size):
if isinstance(output_size, int):
self.output_size = (output_size, output_size)
else:
self.output_size = output_size
def _get_params(self, img):
th, tw = self.output_size
h, w, _ = img.shape
assert th <= h and tw <= w, "output size is bigger than image size"
x = int(round((w - tw) / 2.0))
y = int(round((h - th) / 2.0))
return x, y
def __call__(self, img, lbl):
x, y = self._get_params(img)
th, tw = self.output_size
return img[y:y + th, x:x + tw], lbl
class RandomHorizontalFlip(object):
"""Horizontally flip the input data randomly with a given probability.
Args:
prob (float): probability of the input data being flipped. Default: 0.5
"""
def __init__(self, prob=0.5):
self.prob = prob
def __call__(self, img, lbl):
if np.random.random() < self.prob:
return F.flip(img, code=1), lbl
return img, lbl
class RandomVerticalFlip(object):
"""Vertically flip the input data randomly with a given probability.
Args:
prob (float): probability of the input data being flipped. Default: 0.5
"""
def __init__(self, prob=0.5):
self.prob = prob
def __call__(self, img, lbl):
if np.random.random() < self.prob:
return F.flip(img, code=0), lbl
return img, lbl
class Normalize(object):
"""Normalize the input data with mean and standard deviation.
Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels,
this transform will normalize each channel of the input data.
``output[channel] = (input[channel] - mean[channel]) / std[channel]``
Args:
mean (int|float|list): Sequence of means for each channel.
std (int|float|list): Sequence of standard deviations for each channel.
"""
def __init__(self, mean=0.0, std=1.0):
        if isinstance(mean, numbers.Number):
            mean = [mean, mean, mean]
        if isinstance(std, numbers.Number):
            std = [std, std, std]
self.mean = np.array(mean, dtype=np.float32).reshape(len(mean), 1, 1)
self.std = np.array(std, dtype=np.float32).reshape(len(std), 1, 1)
def __call__(self, img, lbl):
return (img - self.mean) / self.std, lbl
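# Worked example (illustrative): with mean=127.5 and std=127.5, a uint8 pixel
# value of 255 maps to (255 - 127.5) / 127.5 = 1.0 and 0 maps to -1.0, so
# images land channel-wise in [-1, 1].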
class Permute(object):
"""Change input data to a target mode.
For example, most transforms use HWC mode image,
while the Neural Network might use CHW mode input tensor.
Input image should be HWC mode and an instance of numpy.ndarray.
Args:
mode: Output mode of input. Default: "CHW".
to_rgb: convert 'bgr' image to 'rgb'. Default: True.
"""
def __init__(self, mode="CHW", to_rgb=True):
assert mode in [
"CHW"
], "Only support 'CHW' mode, but received mode: {}".format(mode)
self.mode = mode
self.to_rgb = to_rgb
def __call__(self, img, lbl):
if self.to_rgb:
img = img[..., ::-1]
if self.mode == "CHW":
return img.transpose((2, 0, 1)), lbl
return img, lbl
class GaussianNoise(object):
"""Add random gaussian noise to the input data.
Gaussian noise is generated with given mean and std.
Args:
mean: Gaussian mean used to generate noise.
std: Gaussian standard deviation used to generate noise.
"""
def __init__(self, mean=0.0, std=1.0):
self.mean = np.array(mean, dtype=np.float32)
self.std = np.array(std, dtype=np.float32)
def __call__(self, img, lbl):
dtype = img.dtype
noise = np.random.normal(self.mean, self.std, img.shape) * 255
img = img + noise.astype(np.float32)
return np.clip(img, 0, 255).astype(dtype), lbl
class BrightnessTransform(object):
"""Adjust brightness of the image.
Args:
        value: How much to adjust the brightness. Can be any
            non-negative number; 0 gives the original image.
"""
def __init__(self, value):
if value < 0:
raise ValueError("brightness value should be non-negative")
self.value = value
def __call__(self, img, lbl):
if self.value == 0:
return img, lbl
dtype = img.dtype
img = img.astype(np.float32)
alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value)
img = img * alpha
return img.clip(0, 255).astype(dtype), lbl
class ContrastTransform(object):
"""Adjust contrast of the image.
Args:
        value: How much to adjust the contrast. Can be any
            non-negative number; 0 gives the original image.
"""
def __init__(self, value):
if value < 0:
raise ValueError("contrast value should be non-negative")
self.value = value
def __call__(self, img, lbl):
if self.value == 0:
return img, lbl
dtype = img.dtype
img = img.astype(np.float32)
alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value)
img = img * alpha + cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).mean() * (
1 - alpha)
return img.clip(0, 255).astype(dtype), lbl
class SaturationTransform(object):
"""Adjust saturation of the image.
Args:
        value: How much to adjust the saturation. Can be any
            non-negative number; 0 gives the original image.
"""
def __init__(self, value):
if value < 0:
raise ValueError("saturation value should be non-negative")
self.value = value
def __call__(self, img, lbl):
if self.value == 0:
return img, lbl
dtype = img.dtype
img = img.astype(np.float32)
alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value)
gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
gray_img = gray_img[..., np.newaxis]
img = img * alpha + gray_img * (1 - alpha)
return img.clip(0, 255).astype(dtype), lbl
class HueTransform(object):
"""Adjust hue of the image.
Args:
        value: How much to adjust the hue. Can be any number
            between 0 and 0.5; 0 gives the original image.
"""
def __init__(self, value):
if value < 0 or value > 0.5:
raise ValueError("hue value should be in [0.0, 0.5]")
self.value = value
def __call__(self, img, lbl):
if self.value == 0:
return img, lbl
dtype = img.dtype
img = img.astype(np.uint8)
hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV_FULL)
h, s, v = cv2.split(hsv_img)
alpha = np.random.uniform(-self.value, self.value)
h = h.astype(np.uint8)
        # uint8 addition takes care of rotation across boundaries
with np.errstate(over="ignore"):
h += np.uint8(alpha * 255)
hsv_img = cv2.merge([h, s, v])
return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype), lbl
class ColorJitter(object):
"""Randomly change the brightness, contrast, saturation and hue of an image.
Args:
        brightness: How much to jitter brightness.
            Chosen uniformly from [max(0, 1 - brightness), 1 + brightness]
            or the given [min, max]. Should be non-negative numbers.
        contrast: How much to jitter contrast.
            Chosen uniformly from [max(0, 1 - contrast), 1 + contrast]
            or the given [min, max]. Should be non-negative numbers.
        saturation: How much to jitter saturation.
            Chosen uniformly from [max(0, 1 - saturation), 1 + saturation]
            or the given [min, max]. Should be non-negative numbers.
        hue: How much to jitter hue.
            Chosen uniformly from [-hue, hue] or the given [min, max].
            Should have 0 <= hue <= 0.5 or -0.5 <= min <= max <= 0.5.
"""
def __init__(self, brightness=0, contrast=0, saturation=0, hue=0):
transforms = []
if brightness != 0:
transforms.append(BrightnessTransform(brightness))
if contrast != 0:
transforms.append(ContrastTransform(contrast))
if saturation != 0:
transforms.append(SaturationTransform(saturation))
if hue != 0:
transforms.append(HueTransform(hue))
random.shuffle(transforms)
self.transforms = Compose(transforms)
def __call__(self, img, lbl):
return self.transforms(img, lbl)
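# Usage sketch (illustrative): each enabled sub-transform draws its own alpha
# uniformly from [max(0, 1 - value), 1 + value] (or [-hue, hue] for hue), and
# the sub-transforms are applied in a randomly shuffled order:
#
#     jitter = ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1)
#     img_out, lbl_out = jitter(img, lbl)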
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
lexical analysis network structure
"""
from __future__ import division
from __future__ import print_function
import io
import os
import sys
import math
import argparse
import numpy as np
from metrics import Metric
from model import Model, Input, Loss, set_device
import paddle.fluid as fluid
from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.initializer import NormalInitializer
from paddle.fluid.dygraph.nn import Embedding, Linear, GRUUnit
class DynamicGRU(fluid.dygraph.Layer):
def __init__(self,
size,
h_0=None,
param_attr=None,
bias_attr=None,
is_reverse=False,
gate_activation='sigmoid',
candidate_activation='tanh',
origin_mode=False,
init_size=None):
super(DynamicGRU, self).__init__()
self.gru_unit = GRUUnit(
size * 3,
param_attr=param_attr,
bias_attr=bias_attr,
activation=candidate_activation,
gate_activation=gate_activation,
origin_mode=origin_mode)
self.size = size
self.h_0 = h_0
self.is_reverse = is_reverse
def forward(self, inputs):
hidden = self.h_0
res = []
for i in range(inputs.shape[1]):
if self.is_reverse:
i = inputs.shape[1] - 1 - i
input_ = inputs[:, i:i + 1, :]
input_ = fluid.layers.reshape(
input_, [-1, input_.shape[2]], inplace=False)
hidden, reset, gate = self.gru_unit(input_, hidden)
hidden_ = fluid.layers.reshape(
hidden, [-1, 1, hidden.shape[1]], inplace=False)
res.append(hidden_)
if self.is_reverse:
res = res[::-1]
res = fluid.layers.concat(res, axis=1)
return res
class BiGRU(fluid.dygraph.Layer):
def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
super(BiGRU, self).__init__()
self.pre_gru = Linear(
input_dim=input_dim,
output_dim=grnn_hidden_dim * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.gru = DynamicGRU(
size=grnn_hidden_dim,
h_0=h_0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.pre_gru_r = Linear(
input_dim=input_dim,
output_dim=grnn_hidden_dim * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.gru_r = DynamicGRU(
size=grnn_hidden_dim,
is_reverse=True,
h_0=h_0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
def forward(self, input_feature):
res_pre_gru = self.pre_gru(input_feature)
res_gru = self.gru(res_pre_gru)
res_pre_gru_r = self.pre_gru_r(input_feature)
res_gru_r = self.gru_r(res_pre_gru_r)
bi_merge = fluid.layers.concat(input=[res_gru, res_gru_r], axis=-1)
return bi_merge
class Linear_chain_crf(fluid.dygraph.Layer):
def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
super(Linear_chain_crf, self).__init__()
self._param_attr = param_attr
self._dtype = dtype
self._size = size
self._is_test = is_test
self._transition = self.create_parameter(
attr=self._param_attr,
shape=[self._size + 2, self._size],
dtype=self._dtype)
@property
def weight(self):
return self._transition
@weight.setter
def weight(self, value):
self._transition = value
def forward(self, input, label, length=None):
alpha = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
emission_exps = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
transition_exps = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
log_likelihood = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
this_inputs = {
"Emission": [input],
"Transition": self._transition,
"Label": [label]
}
if length:
this_inputs['Length'] = [length]
self._helper.append_op(
type='linear_chain_crf',
inputs=this_inputs,
outputs={
"Alpha": [alpha],
"EmissionExps": [emission_exps],
"TransitionExps": transition_exps,
"LogLikelihood": log_likelihood
},
attrs={"is_test": self._is_test, })
return log_likelihood
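# For reference (a standard linear-chain CRF formulation, not quoted from the
# op implementation): the log-likelihood returned above is
#
#     log p(y | x) = sum_t [ E(y_t, x_t) + T(y_{t-1}, y_t) ] - log Z(x)
#
# where E are the emission scores from `input`, T is the learned `_transition`
# matrix, and Z(x) normalizes over all possible label sequences.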
class Crf_decoding(fluid.dygraph.Layer):
def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
super(Crf_decoding, self).__init__()
self._dtype = dtype
self._size = size
self._is_test = is_test
self._param_attr = param_attr
self._transition = self.create_parameter(
attr=self._param_attr,
shape=[self._size + 2, self._size],
dtype=self._dtype)
@property
def weight(self):
return self._transition
@weight.setter
def weight(self, value):
self._transition = value
def forward(self, input, label=None, length=None):
viterbi_path = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
this_inputs = {
"Emission": [input],
"Transition": self._transition,
"Label": label
}
if length:
this_inputs['Length'] = [length]
self._helper.append_op(
type='crf_decoding',
inputs=this_inputs,
outputs={"ViterbiPath": [viterbi_path]},
attrs={"is_test": self._is_test, })
return viterbi_path
class Chunk_eval(fluid.dygraph.Layer):
def __init__(self,
num_chunk_types,
chunk_scheme,
excluded_chunk_types=None):
super(Chunk_eval, self).__init__()
self.num_chunk_types = num_chunk_types
self.chunk_scheme = chunk_scheme
self.excluded_chunk_types = excluded_chunk_types
def forward(self, input, label, seq_length=None):
precision = self._helper.create_variable_for_type_inference(
dtype="float32")
recall = self._helper.create_variable_for_type_inference(
dtype="float32")
f1_score = self._helper.create_variable_for_type_inference(
dtype="float32")
num_infer_chunks = self._helper.create_variable_for_type_inference(
dtype="int64")
num_label_chunks = self._helper.create_variable_for_type_inference(
dtype="int64")
num_correct_chunks = self._helper.create_variable_for_type_inference(
dtype="int64")
this_input = {"Inference": input, "Label": label[0]}
if seq_length:
this_input["SeqLength"] = seq_length[0]
self._helper.append_op(
type='chunk_eval',
inputs=this_input,
outputs={
"Precision": [precision],
"Recall": [recall],
"F1-Score": [f1_score],
"NumInferChunks": [num_infer_chunks],
"NumLabelChunks": [num_label_chunks],
"NumCorrectChunks": [num_correct_chunks]
},
attrs={
"num_chunk_types": self.num_chunk_types,
"chunk_scheme": self.chunk_scheme,
"excluded_chunk_types": self.excluded_chunk_types or []
})
return (num_infer_chunks, num_label_chunks, num_correct_chunks)
class LAC(Model):
def __init__(self, args, vocab_size, num_labels, length=None):
super(LAC, self).__init__()
"""
define the lexical analysis network structure
word: stores the input of the model
for_infer: a boolean value, indicating if the model to be created is for training or predicting.
return:
for infer: return the prediction
otherwise: return the prediction
"""
self.word_emb_dim = args.word_emb_dim
self.vocab_size = vocab_size
self.num_labels = num_labels
self.grnn_hidden_dim = args.grnn_hidden_dim
self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(
args) else 1.0
        self.crf_lr = args.crf_learning_rate if 'crf_learning_rate' in dir(
            args) else 1.0
self.bigru_num = args.bigru_num
self.init_bound = 0.1
self.word_embedding = Embedding(
size=[self.vocab_size, self.word_emb_dim],
dtype='float32',
param_attr=fluid.ParamAttr(
learning_rate=self.emb_lr,
name="word_emb",
initializer=fluid.initializer.Uniform(
low=-self.init_bound, high=self.init_bound)))
h_0 = fluid.layers.create_global_var(
shape=[args.batch_size, self.grnn_hidden_dim],
value=0.0,
dtype='float32',
persistable=True,
force_cpu=True,
name='h_0')
self.bigru_units = []
for i in range(self.bigru_num):
if i == 0:
self.bigru_units.append(
self.add_sublayer(
"bigru_units%d" % i,
BiGRU(
self.grnn_hidden_dim,
self.grnn_hidden_dim,
self.init_bound,
h_0=h_0)))
else:
self.bigru_units.append(
self.add_sublayer(
"bigru_units%d" % i,
BiGRU(
self.grnn_hidden_dim * 2,
self.grnn_hidden_dim,
self.init_bound,
h_0=h_0)))
self.fc = Linear(
input_dim=self.grnn_hidden_dim * 2,
output_dim=self.num_labels,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-self.init_bound, high=self.init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.linear_chain_crf = Linear_chain_crf(
param_attr=fluid.ParamAttr(
name='linear_chain_crfw', learning_rate=self.crf_lr),
size=self.num_labels)
self.crf_decoding = Crf_decoding(
param_attr=fluid.ParamAttr(
name='crfw', learning_rate=self.crf_lr),
size=self.num_labels)
def forward(self, word, target, lengths):
"""
Configure the network
"""
word_embed = self.word_embedding(word)
input_feature = word_embed
for i in range(self.bigru_num):
bigru_output = self.bigru_units[i](input_feature)
input_feature = bigru_output
emission = self.fc(bigru_output)
crf_cost = self.linear_chain_crf(
input=emission, label=target, length=lengths)
avg_cost = fluid.layers.mean(x=crf_cost)
self.crf_decoding.weight = self.linear_chain_crf.weight
crf_decode = self.crf_decoding(input=emission, length=lengths)
return crf_decode, avg_cost, lengths
class LacLoss(Loss):
def __init__(self):
super(LacLoss, self).__init__()
def forward(self, outputs, labels):
avg_cost = outputs[1]
return avg_cost
class ChunkEval(Metric):
def __init__(self, num_labels, name=None, *args, **kwargs):
super(ChunkEval, self).__init__(*args, **kwargs)
self._init_name(name)
self.chunk_eval = Chunk_eval(
int(math.ceil((num_labels - 1) / 2.0)), "IOB")
self.reset()
def add_metric_op(self, pred, label, *args, **kwargs):
crf_decode = pred[0]
lengths = pred[2]
(num_infer_chunks, num_label_chunks,
num_correct_chunks) = self.chunk_eval(
input=crf_decode, label=label, seq_length=lengths)
return [num_infer_chunks, num_label_chunks, num_correct_chunks]
def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks,
*args, **kwargs):
self.infer_chunks_total += num_infer_chunks
self.label_chunks_total += num_label_chunks
self.correct_chunks_total += num_correct_chunks
precision = float(
num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0
recall = float(
num_correct_chunks) / num_label_chunks if num_label_chunks else 0
f1_score = float(2 * precision * recall) / (
precision + recall) if num_correct_chunks else 0
return [precision, recall, f1_score]
def reset(self):
self.infer_chunks_total = 0
self.label_chunks_total = 0
self.correct_chunks_total = 0
def accumulate(self):
precision = float(
self.correct_chunks_total
) / self.infer_chunks_total if self.infer_chunks_total else 0
recall = float(
self.correct_chunks_total
) / self.label_chunks_total if self.label_chunks_total else 0
f1_score = float(2 * precision * recall) / (
precision + recall) if self.correct_chunks_total else 0
res = [precision, recall, f1_score]
return res
    def _init_name(self, name):
        # the metric always reports these three values
        self._name = ['precision', 'recall', 'F1']
def name(self):
return self._name
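# A worked example of how the accumulated chunk statistics turn into scores
# (the counts are hypothetical): with 8 inferred chunks, 10 labeled chunks
# and 6 correct chunks, accumulate() reports
#     precision = 6 / 8  = 0.75
#     recall    = 6 / 10 = 0.60
#     F1        = 2 * 0.75 * 0.60 / (0.75 + 0.60) ~= 0.667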
class LacDataset(object):
"""
Load lexical analysis dataset
"""
def __init__(self, args):
self.word_dict_path = args.word_dict_path
self.label_dict_path = args.label_dict_path
self.word_rep_dict_path = args.word_rep_dict_path
self._load_dict()
def _load_dict(self):
self.word2id_dict = self.load_kv_dict(
self.word_dict_path, reverse=True, value_func=np.int64)
self.id2word_dict = self.load_kv_dict(self.word_dict_path)
self.label2id_dict = self.load_kv_dict(
self.label_dict_path, reverse=True, value_func=np.int64)
self.id2label_dict = self.load_kv_dict(self.label_dict_path)
if self.word_rep_dict_path is None:
self.word_replace_dict = dict()
else:
self.word_replace_dict = self.load_kv_dict(self.word_rep_dict_path)
def load_kv_dict(self,
dict_path,
reverse=False,
delimiter="\t",
key_func=None,
value_func=None):
"""
Load key-value dict from file
"""
result_dict = {}
for line in io.open(dict_path, "r", encoding='utf8'):
terms = line.strip("\n").split(delimiter)
if len(terms) != 2:
continue
if reverse:
value, key = terms
else:
key, value = terms
if key in result_dict:
raise KeyError("key duplicated with [%s]" % (key))
if key_func:
key = key_func(key)
if value_func:
value = value_func(value)
result_dict[key] = value
return result_dict
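    # A sketch of the expected dict file format (a hypothetical two-line
    # file, "<id><TAB><token>" per line):
    #     0	OOV
    #     1	the
    # load_kv_dict(path)                                    -> {"0": "OOV", "1": "the"}
    # load_kv_dict(path, reverse=True, value_func=np.int64) -> {"OOV": 0, "the": 1}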
    @property
    def vocab_size(self):
        return len(self.word2id_dict)
    @property
    def num_labels(self):
        return len(self.label2id_dict)
def get_num_examples(self, filename):
"""num of line of file"""
return sum(1 for line in io.open(filename, "r", encoding='utf8'))
def word_to_ids(self, words):
"""convert word to word index"""
word_ids = []
for word in words:
word = self.word_replace_dict.get(word, word)
if word not in self.word2id_dict:
word = "OOV"
word_id = self.word2id_dict[word]
word_ids.append(word_id)
return word_ids
def label_to_ids(self, labels):
"""convert label to label index"""
label_ids = []
for label in labels:
if label not in self.label2id_dict:
label = "O"
label_id = self.label2id_dict[label]
label_ids.append(label_id)
return label_ids
def file_reader(self,
filename,
mode="train",
batch_size=32,
max_seq_len=126):
"""
yield (word_idx, target_idx) one by one from file,
or yield (word_idx, ) in `infer` mode
"""
def wrapper():
fread = io.open(filename, "r", encoding="utf-8")
headline = next(fread)
headline = headline.strip().split('\t')
assert len(headline) == 2 and headline[0] == "text_a" and headline[
1] == "label"
buf = []
for line in fread:
words, labels = line.strip("\n").split("\t")
if len(words) < 1:
continue
word_ids = self.word_to_ids(words.split("\002"))
label_ids = self.label_to_ids(labels.split("\002"))
assert len(word_ids) == len(label_ids)
word_ids = word_ids[0:max_seq_len]
words_len = np.int64(len(word_ids))
word_ids += [0 for _ in range(max_seq_len - words_len)]
label_ids = label_ids[0:max_seq_len]
label_ids += [0 for _ in range(max_seq_len - words_len)]
assert len(word_ids) == len(label_ids)
yield word_ids, label_ids, words_len
fread.close()
return wrapper
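# LacDataset.file_reader assumes a TSV of the following shape (the sample
# line is hypothetical): a "text_a<TAB>label" header, then one sample per
# line where the tokens and their tags are joined by the "\002" separator:
#     text_a	label
#     word1\002word2	B-ORG\002I-ORG
# Each yielded sample is (padded word ids, padded label ids, true length).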
def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
def wrapper():
batch_words, batch_labels, seq_lens = [], [], []
        for epoch in range(args.epoch):
for instance in reader.file_reader(
file_name, mode, max_seq_len=args.max_seq_len)():
words, labels, words_len = instance
if len(seq_lens) < args.batch_size:
batch_words.append(words)
batch_labels.append(labels)
seq_lens.append(words_len)
if len(seq_lens) == args.batch_size:
yield batch_words, batch_labels, seq_lens, batch_labels
batch_words, batch_labels, seq_lens = [], [], []
if len(seq_lens) > 0:
yield batch_words, batch_labels, seq_lens, batch_labels
batch_words, batch_labels, seq_lens = [], [], []
return wrapper
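# Each yielded batch is (batch_words, batch_labels, seq_lens, batch_labels);
# the labels appear twice because the CRF layer consumes them as a network
# input ('target') while the Metric consumes them again as the ground
# truth ('labels').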
def create_dataloader(generator, place, feed_list=None):
if not feed_list:
data_loader = fluid.io.DataLoader.from_generator(
capacity=50,
use_double_buffer=True,
iterable=True,
return_list=True)
else:
data_loader = fluid.io.DataLoader.from_generator(
feed_list=feed_list,
capacity=50,
use_double_buffer=True,
iterable=True,
return_list=True)
data_loader.set_batch_generator(generator, places=place)
return data_loader
def main(args):
place = set_device(args.device)
    if args.dynamic:
        fluid.enable_dygraph(place)
    inputs = [
        Input([None, args.max_seq_len], 'int64', name='words'),
        Input([None, args.max_seq_len], 'int64', name='target'),
        Input([None], 'int64', name='length'),
    ]
labels = [Input([None, args.max_seq_len], 'int64', name='labels')]
feed = [x.forward() for x in inputs + labels]
dataset = LacDataset(args)
train_path = os.path.join(args.data, "train.tsv")
test_path = os.path.join(args.data, "test.tsv")
if args.dynamic:
feed_list = None
else:
feed_list = feed
train_generator = create_lexnet_data_generator(
args, reader=dataset, file_name=train_path, place=place, mode="train")
test_generator = create_lexnet_data_generator(
args, reader=dataset, file_name=test_path, place=place, mode="test")
train_dataset = create_dataloader(
train_generator, place, feed_list=feed_list)
test_dataset = create_dataloader(
test_generator, place, feed_list=feed_list)
vocab_size = dataset.vocab_size
num_labels = dataset.num_labels
model = LAC(args, vocab_size, num_labels)
optim = AdamOptimizer(
learning_rate=args.base_learning_rate,
parameter_list=model.parameters())
model.prepare(
optim,
LacLoss(),
ChunkEval(num_labels),
inputs=inputs,
labels=labels,
device=args.device)
if args.resume is not None:
model.load(args.resume)
model.fit(train_dataset,
test_dataset,
epochs=args.epoch,
batch_size=args.batch_size,
eval_freq=args.eval_freq,
save_freq=args.save_freq,
save_dir=args.save_dir)
if __name__ == '__main__':
parser = argparse.ArgumentParser("LAC training")
parser.add_argument(
"-dir", "--data", default=None, type=str, help='path to LAC dataset')
parser.add_argument(
"-wd",
"--word_dict_path",
default=None,
type=str,
help='word dict path')
parser.add_argument(
"-ld",
"--label_dict_path",
default=None,
type=str,
help='label dict path')
parser.add_argument(
"-wrd",
"--word_rep_dict_path",
default=None,
type=str,
        help='path of the word replacement dictionary')
parser.add_argument(
"-dev",
"--device",
type=str,
default='gpu',
help="device to use, gpu or cpu")
parser.add_argument(
"-d", "--dynamic", action='store_true', help="enable dygraph mode")
parser.add_argument(
"-e", "--epoch", default=10, type=int, help="number of epoch")
parser.add_argument(
'-lr',
'--base_learning_rate',
default=1e-3,
type=float,
metavar='LR',
help='initial learning rate')
parser.add_argument(
"--word_emb_dim",
default=128,
type=int,
help='word embedding dimension')
parser.add_argument(
"--grnn_hidden_dim", default=128, type=int, help="hidden dimension")
parser.add_argument(
"--bigru_num", default=2, type=int, help='the number of bi-rnn')
parser.add_argument("-elr", "--emb_learning_rate", default=1.0, type=float)
parser.add_argument("-clr", "--crf_learning_rate", default=1.0, type=float)
parser.add_argument(
"-b", "--batch_size", default=300, type=int, help="batch size")
parser.add_argument(
"--max_seq_len", default=126, type=int, help="max sequence length")
parser.add_argument(
"-n", "--num_devices", default=1, type=int, help="number of devices")
parser.add_argument(
"-r",
"--resume",
default=None,
type=str,
help="checkpoint path to resume")
parser.add_argument(
"-o",
"--save_dir",
default="./model",
type=str,
help="save model path")
parser.add_argument(
"-sf", "--save_freq", default=1, type=int, help="save frequency")
parser.add_argument(
"-ef", "--eval_freq", default=1, type=int, help="eval frequency")
args = parser.parse_args()
print(args)
main(args)
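# An example invocation (the script name and data paths are placeholders):
#     python train.py --data ./data/lexical_analysis \
#         --word_dict_path ./conf/word.dic \
#         --label_dict_path ./conf/tag.dic \
#         --device gpu --dynamic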
@@ -24,7 +24,7 @@ import numpy as np
 from paddle import fluid
 from paddle.fluid.optimizer import Momentum
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
-from paddle.fluid.io import MNIST as MnistDataset
+from vision.datasets import MNIST as MnistDataset
 from model import Model, CrossEntropy, Input, set_device
 from metrics import Accuracy
...
[metadata]
name = hapi
author = zhouxiangyang
author_email = zhouxiangyang@baidu.com
version = 0.0.1
description = HAPI
long_description = file: README.md
long_description_content_type = text/markdown
home_page = https://github.com/PaddlePaddle/hapi
license = Apache 2.0
classifier =
Private :: Do Not Upload
Programming Language :: Python
Programming Language :: Python :: 2
Programming Language :: Python :: 2.7
Programming Language :: Python :: 3
Programming Language :: Python :: 3.5
Programming Language :: Python :: 3.6
Programming Language :: Python :: 3.7
keywords =
paddlepaddle
paddle
high-level-api
[options]
packages = find:
#install_requires =
# paddlepaddle-gpu >= 1.5.2
include_package_data = True
zip_safe = False
[sdist]
dist_dir = output/dist
[bdist_wheel]
dist_dir = output/dist
[easy_install]
index_url = http://pip.baidu.com/root/baidu/+simple/
# -*- coding: UTF-8 -*-
################################################################################
#
# Copyright (c) 2020 Baidu.com, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
"""
Setup script.
Authors: zhouxiangyang(zhouxiangyang@baidu.com)
Date: 2020/2/4 00:00:01
"""
import setuptools
with open("README.md", "r") as fh:
long_description = fh.read()
setuptools.setup(
name="hapi",
version="0.0.1",
author="PaddlePaddle",
author_email="zhouxiangyang@baidu.com",
description="A Paddle High-level API that supports both static and dynamic execution modes (still under development)",
url="https://github.com/PaddlePaddle/hapi",
packages=[
'hapi',
'hapi.datasets',
'hapi.text',
'hapi.text.tokenizer',
'hapi.text.bert',
'hapi.text.bert.utils',
'hapi.vision',
'hapi.vision.models',
'hapi.vision.transforms',
],
package_dir={
'hapi': './hapi',
'hapi.datasets': './hapi/datasets',
'hapi.text': './hapi/text',
'hapi.text.tokenizer': './hapi/text/tokenizer',
'hapi.text.bert': './hapi/text/bert',
'hapi.text.bert.utils': './hapi/text/bert/utils',
'hapi.vision': './hapi/vision',
'hapi.vision.models': './hapi/vision/models',
'hapi.vision.transforms': './hapi/vision/transforms',
},
platforms="any",
license='Apache 2.0',
classifiers=[
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
], )
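# With this setup.py and the setup.cfg above, a distributable package can be
# built in the usual way (output lands in output/dist per the [sdist] and
# [bdist_wheel] sections):
#     python setup.py sdist bdist_wheel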
import paddle
from hapi.model import set_device
from hapi.text.bert.dataloader import SingleSentenceDataLoader
import hapi.text.tokenizer.tokenization as tokenization
device = set_device("cpu")
paddle.fluid.enable_dygraph(device)
tokenizer = tokenization.FullTokenizer(
vocab_file="./tmp/hapi/data/pretrained_models/uncased_L-12_H-768_A-12/vocab.txt",
do_lower_case=True)
bert_dataloader = SingleSentenceDataLoader(
"./tmp/hapi/aaa.txt",
tokenizer, ["1", "2"],
max_seq_length=32,
batch_size=1)
for data in bert_dataloader.dataloader():
print(data)
@@ -12,11 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# when test, you should add hapi root path to the PYTHONPATH,
+# export PYTHONPATH=PATH_TO_HAPI:$PYTHONPATH
 import unittest
 import time
 import random
-from callbacks import config_callbacks
+from hapi.callbacks import config_callbacks
 class TestCallbacks(unittest.TestCase):
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# when test, you should add hapi root path to the PYTHONPATH,
# export PYTHONPATH=PATH_TO_HAPI:$PYTHONPATH
import unittest
import numpy as np
from hapi.datasets import *
class TestFolderDatasets(unittest.TestCase):
def test_dataset(self):
dataset_folder = DatasetFolder('tests/test_data')
for _ in dataset_folder:
pass
assert len(dataset_folder) == 3
assert len(dataset_folder.classes) == 2
class TestMNISTTest(unittest.TestCase):
def test_main(self):
mnist = MNIST(mode='test')
self.assertTrue(len(mnist) == 10000)
for i in range(len(mnist)):
image, label = mnist[i]
self.assertTrue(image.shape[0] == 784)
self.assertTrue(label.shape[0] == 1)
self.assertTrue(0 <= int(label) <= 9)
class TestMNISTTrain(unittest.TestCase):
def test_main(self):
mnist = MNIST(mode='train')
self.assertTrue(len(mnist) == 60000)
for i in range(len(mnist)):
image, label = mnist[i]
self.assertTrue(image.shape[0] == 784)
self.assertTrue(label.shape[0] == 1)
self.assertTrue(0 <= int(label) <= 9)
class TestFlowersTrain(unittest.TestCase):
def test_main(self):
flowers = Flowers(mode='train')
self.assertTrue(len(flowers) == 6149)
        # traversing the whole dataset may take a long
        # time, so randomly check one sample only
idx = np.random.randint(0, 6149)
image, label = flowers[idx]
self.assertTrue(len(image.shape) == 3)
self.assertTrue(image.shape[2] == 3)
self.assertTrue(label.shape[0] == 1)
class TestFlowersValid(unittest.TestCase):
def test_main(self):
flowers = Flowers(mode='valid')
self.assertTrue(len(flowers) == 1020)
        # traversing the whole dataset may take a long
        # time, so randomly check one sample only
idx = np.random.randint(0, 1020)
image, label = flowers[idx]
self.assertTrue(len(image.shape) == 3)
self.assertTrue(image.shape[2] == 3)
self.assertTrue(label.shape[0] == 1)
class TestFlowersTest(unittest.TestCase):
def test_main(self):
flowers = Flowers(mode='test')
self.assertTrue(len(flowers) == 1020)
        # traversing the whole dataset may take a long
        # time, so randomly check one sample only
idx = np.random.randint(0, 1020)
image, label = flowers[idx]
self.assertTrue(len(image.shape) == 3)
self.assertTrue(image.shape[2] == 3)
self.assertTrue(label.shape[0] == 1)
if __name__ == '__main__':
unittest.main()
@@ -15,24 +15,25 @@
 from __future__ import division
 from __future__ import print_function
+# when test, you should add hapi root path to the PYTHONPATH,
+# export PYTHONPATH=PATH_TO_HAPI:$PYTHONPATH
 import unittest
 import os
-import sys
-sys.path.append('../')
 import numpy as np
 import contextlib
 import paddle
 from paddle import fluid
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear
-from model import Model, CrossEntropy, Input, Loss, set_device
-from metrics import Accuracy
-from callbacks import ProgBarLogger
-from paddle.fluid.io import BatchSampler, DataLoader
-from paddle.fluid.io import MNIST as MnistDataset
+from paddle.io import BatchSampler, DataLoader
+from hapi.model import Model, CrossEntropy, Input, Loss, set_device
+from hapi.metrics import Accuracy
+from hapi.callbacks import ProgBarLogger
+from hapi.datasets import MNIST as MnistDataset
 class SimpleImgConvPool(fluid.dygraph.Layer):
...
@@ -12,11 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# when test, you should add hapi root path to the PYTHONPATH,
+# export PYTHONPATH=PATH_TO_HAPI:$PYTHONPATH
 import unittest
 import random
 import time
-from progressbar import ProgressBar
+from hapi.progressbar import ProgressBar
 class TestProgressBar(unittest.TestCase):
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# when test, you should add hapi root path to the PYTHONPATH,
# export PYTHONPATH=PATH_TO_HAPI:$PYTHONPATH
import unittest
from hapi.datasets import DatasetFolder
import hapi.vision.transforms as transforms
class TestTransforms(unittest.TestCase):
def do_transform(self, trans):
dataset_folder = DatasetFolder('tests/test_data', transform=trans)
for _ in dataset_folder:
pass
def test_trans0(self):
normalize = transforms.Normalize(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.120, 57.375])
trans = transforms.Compose([
transforms.RandomResizedCrop(224), transforms.GaussianNoise(),
transforms.ColorJitter(
brightness=0.4, contrast=0.4, saturation=0.4,
hue=0.4), transforms.RandomHorizontalFlip(),
transforms.Permute(mode='CHW'), normalize
])
self.do_transform(trans)
def test_trans1(self):
trans = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
])
self.do_transform(trans)
def test_trans2(self):
trans = transforms.Compose([transforms.CenterCropResize(224)])
self.do_transform(trans)
if __name__ == '__main__':
unittest.main()
@@ -22,7 +22,7 @@ from functools import partial
 import numpy as np
 import paddle
 import paddle.fluid as fluid
-from paddle.fluid.io import DataLoader
+from paddle.io import DataLoader
 from paddle.fluid.layers.utils import flatten
 from utils.configure import PDConfig
...
@@ -22,7 +22,7 @@ from functools import partial
 import numpy as np
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.parallel import ParallelEnv
-from paddle.fluid.io import BatchSampler, DataLoader, Dataset
+from paddle.io import BatchSampler, DataLoader, Dataset
 def create_data_loader(args, device):
...
@@ -21,7 +21,7 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import numpy as np
 import paddle
 import paddle.fluid as fluid
-from paddle.fluid.io import DataLoader
+from paddle.io import DataLoader
 from utils.configure import PDConfig
 from utils.check import check_gpu, check_version
...