Commit ed6abbe1 authored by: qingqing01

Update to master and add more doc

@@ -18,10 +18,10 @@ from hapi.metrics import Accuracy
 from hapi.configure import Config
 from hapi.text.bert import BertEncoder
 from paddle.fluid.dygraph import Linear, Layer
-from hapi.model import set_device, Model, Input
 from hapi.loss import SoftmaxWithCrossEntropy
+from hapi.model import set_device, Model, Input
 import hapi.text.tokenizer.tokenization as tokenization
-from hapi.text.bert import Optimizer, BertConfig, BertDataLoader, BertInputExample
+from hapi.text.bert import BertConfig, BertDataLoader, BertInputExample, make_optimizer

 class ClsModelLayer(Model):
@@ -128,7 +128,7 @@ def main():
             [None, None], 'int64', name='src_ids'), Input(
             [None, None], 'int64', name='pos_ids'), Input(
             [None, None], 'int64', name='sent_ids'), Input(
-            [None, None], 'float32', name='input_mask')
+            [None, None, 1], 'float32', name='input_mask')
     ]
     labels = [Input([None, 1], 'int64', name='label')]
@@ -139,13 +139,13 @@ def main():
         len(["contradiction", "entailment", "neutral"]),
         return_pooled_out=True)

-    optimizer = Optimizer(
+    optimizer = make_optimizer(
         warmup_steps=warmup_steps,
         num_train_steps=max_train_steps,
         learning_rate=config.learning_rate,
-        model_cls=cls_model,
         weight_decay=config.weight_decay,
         scheduler=config.lr_scheduler,
+        model=cls_model,
         loss_scaling=config.loss_scaling,
         parameter_list=cls_model.parameters())
@@ -157,8 +157,7 @@ def main():
         labels,
         device=device)

-    cls_model.bert_layer.init_parameters(
-        config.init_pretraining_params, verbose=config.verbose)
+    cls_model.bert_layer.load("./bert_small", reset_optimizer=True)

     # do train
     cls_model.fit(train_data=train_dataloader.dataloader,
...
@@ -4,7 +4,7 @@ TASK_NAME='MNLI'
 DATA_PATH="./data/glue_data/MNLI/"
 CKPT_PATH="./data/saved_model/mnli_models"

-export CUDA_VISIBLE_DEVICES=0
+export CUDA_VISIBLE_DEVICES=1

 # start fine-tuning
 python3.7 bert_classifier.py\
...
@@ -18,10 +18,10 @@ from hapi.metrics import Accuracy
 from hapi.configure import Config
 from hapi.text.bert import BertEncoder
 from paddle.fluid.dygraph import Linear, Layer
-from hapi.model import set_device, Model, Input
 from hapi.loss import SoftmaxWithCrossEntropy
+from hapi.model import set_device, Model, Input
 import hapi.text.tokenizer.tokenization as tokenization
-from hapi.text.bert import Optimizer, BertConfig, BertDataLoader, BertInputExample
+from hapi.text.bert import BertConfig, BertDataLoader, BertInputExample, make_optimizer

 class ClsModelLayer(Model):
@@ -99,12 +99,12 @@ def main():
     train_dataloader = BertDataLoader(
         "./data/glue_data/MNLI/train.tsv",
-        tokenizer, ["contradiction", "entailment", "neutral"],
+        tokenizer,
+        ["contradiction", "entailment", "neutral"],
         max_seq_length=config.max_seq_len,
         batch_size=config.batch_size,
         line_processor=mnli_line_processor,
-        mode="leveldb",
-        phase="train")
+        mode="leveldb", )

     test_dataloader = BertDataLoader(
         "./data/glue_data/MNLI/dev_matched.tsv",
@@ -130,7 +130,7 @@ def main():
             [None, None], 'int64', name='src_ids'), Input(
             [None, None], 'int64', name='pos_ids'), Input(
             [None, None], 'int64', name='sent_ids'), Input(
-            [None, None], 'float32', name='input_mask')
+            [None, None, 1], 'float32', name='input_mask')
     ]
     labels = [Input([None, 1], 'int64', name='label')]
@@ -141,13 +141,13 @@ def main():
         len(["contradiction", "entailment", "neutral"]),
         return_pooled_out=True)

-    optimizer = Optimizer(
+    optimizer = make_optimizer(
         warmup_steps=warmup_steps,
         num_train_steps=max_train_steps,
         learning_rate=config.learning_rate,
-        model_cls=cls_model,
         weight_decay=config.weight_decay,
         scheduler=config.lr_scheduler,
+        model=cls_model,
         loss_scaling=config.loss_scaling,
         parameter_list=cls_model.parameters())
@@ -159,8 +159,7 @@ def main():
         labels,
         device=device)

-    cls_model.bert_layer.init_parameters(
-        config.init_pretraining_params, verbose=config.verbose)
+    cls_model.bert_layer.load("./bert_small", reset_optimizer=True)

     # do train
     cls_model.fit(train_data=train_dataloader.dataloader,
...
@@ -5,7 +5,7 @@ DATA_PATH="./data/glue_data/MNLI/"
 CKPT_PATH="./data/saved_model/mnli_models"

 # start fine-tuning
-python3.7 -m paddle.distributed.launch --started_port 8899 --selected_gpus=0,1,2,3 bert_classifier.py\
+python3.7 -m paddle.distributed.launch --started_port 8899 --selected_gpus=1,2,3 bert_classifier.py\
 --use_cuda true \
 --do_train true \
 --do_test true \
...
@@ -4,7 +4,7 @@ TASK_NAME='MNLI'
 DATA_PATH="./data/glue_data/MNLI/"
 CKPT_PATH="./data/saved_model/mnli_models"

-export CUDA_VISIBLE_DEVICES=0
+export CUDA_VISIBLE_DEVICES=1

 # start fine-tuning
 python3.7 bert_classifier.py\
...
## Introduction

Sentiment is a high-level intelligent human behavior, and identifying the sentiment orientation of text requires deep semantic modeling. In addition, different domains (e.g., catering, sports) express sentiment differently, so model training needs large-scale data covering many domains. We address both problems with a deep-learning-based semantic model plus large-scale data mining. For evaluation, we benchmark on the open-source sentiment classification dataset ChnSentiCorp:

| Model | dev | test |
| :------| :------ | :------ |
| CNN | 90.6% | 89.7% |
| BOW | 90.1% | 90.3% |
| GRU | 90.0% | 91.1% |
| BIGRU | 89.7% | 89.6% |

For documentation on dynamic graph (dygraph) mode, see [Dygraph](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/user_guides/howto/dygraph/DyGraph.html).
## Quick Start

This project requires PaddlePaddle 1.7.0 or later; see the [installation guide](http://www.paddlepaddle.org/#quick-start) for setup.
Python 2.7, or Python 3.5 and above, is required.

#### Code download and environment setup

Clone the repository and set the `PYTHONPATH` environment variable:
```shell
git clone https://github.com/PaddlePaddle/hapi
cd hapi
export PYTHONPATH=$PYTHONPATH:`pwd`
cd examples/sentiment_classification
```
#### Data preparation

Download the preprocessed data. After extraction, the senta_data directory contains the training set (train.tsv), the development set (dev.tsv), the test set (test.tsv), and the corresponding vocabulary (word_dict.txt):
```shell
wget https://baidu-nlp.bj.bcebos.com/sentiment_classification-dataset-1.0.0.tar.gz
tar -zxvf sentiment_classification-dataset-1.0.0.tar.gz
```
#### Model training

With the example dataset, the command below trains a model on the training set (train.tsv) and validates it on the development set (dev.tsv). Before training, manually create the directory where models will be saved, and set its path via `checkpoints`.
Choose `model_type` from `bow_net`, `cnn_net`, `gru_net`, or `bigru_net`.
All model parameters are configured in `senta.yaml`; for training, make sure the `do_train` attribute in `senta.yaml` is set to `True`:
```shell
python sentiment_classifier.py
```
#### Model prediction

With a trained model, the command below predicts labels for unlabeled data (test.tsv).
For prediction, make sure the `do_infer` attribute in `senta.yaml` is set to `True`:
```shell
python sentiment_classifier.py
```
#### Model parameters

Model parameters are configured in `senta.yaml` (a quick way to inspect them follows the list):

1. `batch_size`: choose according to the model and GPU utilization; a larger batch size is recommended for cnn/bow, a smaller one for gru/bigru.
2. `padding_size`: defaults to 150.
3. `epoch`: defaults to 5 for training and 1 for inference.
4. `learning_rate`: defaults to 0.002.
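
These values can be loaded and printed with hapi's `Config` helper, exactly as `sentiment_classifier.py` does. A minimal sketch, assuming it runs from `examples/sentiment_classification` with hapi on `PYTHONPATH`:

```python
# Minimal sketch: load senta.yaml and dump the parameters listed above.
from hapi.configure import Config

args = Config(yaml_file='./senta.yaml')
args.build()
args.Print()  # prints batch_size, padding_size, epoch, lr, model_type, ...
```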
## Advanced Usage

#### Task definition

Traditional sentiment classification is based on lexicons or feature engineering, which requires laborious manual feature design and prior knowledge, stays at a shallow level of understanding, and generalizes poorly. To avoid these limitations, we adopt deep learning, which has developed rapidly in recent years. Deep-learning-based sentiment classification does not depend on hand-crafted features: it understands the input text end to end and judges sentiment orientation from the learned semantic representation.

#### Model overview

For the sentiment classification task, this project provides the following models (a usage sketch follows the list):
+ CNN (Convolutional Neural Network): a basic sequence model that handles variable-length input and extracts features within local regions;
+ BOW (Bag Of Words): a non-sequence model with a simple fully connected structure;
+ GRU (Gated Recurrent Unit): a sequence model that better captures long-distance dependencies in text;
+ BI-GRU (Bidirectional Gated Recurrent Unit): a sequence model with a bidirectional, two-layer GRU structure that better captures sentence-level semantics.
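
All four networks are defined in `models.py` (included below) and share one constructor signature, so switching models is a one-line configuration change. A minimal sketch of the selection logic used by `sentiment_classifier.py`, with the default values from `senta.yaml`:

```python
# Minimal sketch: pick a network by model_type, as sentiment_classifier.py does.
# Constructor arguments are vocab_size, batch_size and padding_size
# (values below are the senta.yaml defaults).
from models import CNN, BOW, GRU, BiGRU

nets = {'cnn_net': CNN, 'bow_net': BOW, 'gru_net': GRU, 'bigru_net': BiGRU}
model = nets['bow_net'](33256, 20, 150)
```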
#### Data format

Training, prediction, and evaluation data can be organized by users according to their actual application scenario. The data consists of two tab-separated columns: the first column is space-tokenized Chinese text (the tokenization preprocessing is described below), with files encoded in UTF-8; the second column is the sentiment class (0 for negative, 1 for positive). Note that the first line of each data file is fixed to "text_a\tlabel":
```text
特 喜欢 这种 好看的 狗狗 1
这 真是 惊艳 世界 的 中国 黑科技 1
环境 特别 差 ,脏兮兮 的,再也 不去 了 0
```
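
For reference, a minimal sketch of a reader for this format (the real loading is handled by `SentaProcessor`; the helper name here is hypothetical):

```python
# Hypothetical helper: parse the tab-separated "text_a\tlabel" format above.
def read_senta_tsv(path):
    samples = []
    with open(path, encoding='utf8') as f:
        next(f)  # skip the fixed "text_a\tlabel" header line
        for line in f:
            text, label = line.rstrip('\n').split('\t')
            samples.append((text.split(), int(label)))  # tokens, 0/1 label
    return samples
```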
#### Code structure

```text
.
├── sentiment_classifier.py  # project entry point: training, prediction, and evaluation
├── models.py                # network structure definitions
```
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Linear, Embedding
from paddle.fluid.dygraph.base import to_variable
import numpy as np
from hapi.model import Model
from hapi.text.text import GRUEncoderLayer as BiGRUEncoder
from hapi.text.text import BOWEncoder, CNNEncoder, GRUEncoder
class CNN(Model):
def __init__(self, dict_dim, batch_size, seq_len):
super(CNN, self).__init__()
self.dict_dim = dict_dim
self.emb_dim = 128
self.hid_dim = 128
self.fc_hid_dim = 96
self.class_dim = 2
self.channels = 1
self.win_size = [3, self.hid_dim]
self.batch_size = batch_size
self.seq_len = seq_len
        self._encoder = CNNEncoder(
            dict_size=self.dict_dim + 1,
            emb_dim=self.emb_dim,
            seq_len=self.seq_len,
            filter_size=self.win_size,
            num_filters=self.hid_dim,
            hidden_dim=self.hid_dim,
            padding_idx=None,
            act='tanh')
        self._fc1 = Linear(
            input_dim=self.hid_dim * self.seq_len,
            output_dim=self.fc_hid_dim,
            act="softmax")
        self._fc_prediction = Linear(
            input_dim=self.fc_hid_dim,
            output_dim=self.class_dim,
            act="softmax")
def forward(self, inputs):
conv_3 = self._encoder(inputs)
fc_1 = self._fc1(conv_3)
prediction = self._fc_prediction(fc_1)
return prediction
class BOW(Model):
def __init__(self, dict_dim, batch_size, seq_len):
super(BOW, self).__init__()
self.dict_dim = dict_dim
self.emb_dim = 128
self.hid_dim = 128
self.fc_hid_dim = 96
self.class_dim = 2
self.batch_size = batch_size
self.seq_len = seq_len
self._encoder = BOWEncoder(
dict_size=self.dict_dim + 1,
emb_dim=self.emb_dim,
padding_idx=None,
bow_dim=self.hid_dim,
seq_len=self.seq_len)
        self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim, act="tanh")
        self._fc2 = Linear(input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
        self._fc_prediction = Linear(
            input_dim=self.fc_hid_dim,
            output_dim=self.class_dim,
            act="softmax")
def forward(self, inputs):
bow_1 = self._encoder(inputs)
bow_1 = fluid.layers.tanh(bow_1)
fc_1 = self._fc1(bow_1)
fc_2 = self._fc2(fc_1)
prediction = self._fc_prediction(fc_2)
return prediction
class GRU(Model):
def __init__(self, dict_dim, batch_size, seq_len):
super(GRU, self).__init__()
self.dict_dim = dict_dim
self.emb_dim = 128
self.hid_dim = 128
self.fc_hid_dim = 96
self.class_dim = 2
self.batch_size = batch_size
self.seq_len = seq_len
self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
output_dim=self.class_dim,
act="softmax")
self._encoder = GRUEncoder(
dict_size=self.dict_dim + 1,
emb_dim=self.emb_dim,
gru_dim=self.hid_dim,
hidden_dim=self.hid_dim,
padding_idx=None,
seq_len=self.seq_len)
def forward(self, inputs):
emb = self._encoder(inputs)
fc_1 = self._fc1(emb)
prediction = self._fc_prediction(fc_1)
return prediction
class BiGRU(Model):
def __init__(self, dict_dim, batch_size, seq_len):
super(BiGRU, self).__init__()
self.dict_dim = dict_dim
self.emb_dim = 128
self.hid_dim = 128
self.fc_hid_dim = 96
self.class_dim = 2
self.batch_size = batch_size
self.seq_len = seq_len
self.embedding = Embedding(
size=[self.dict_dim + 1, self.emb_dim],
dtype='float32',
param_attr=fluid.ParamAttr(learning_rate=30),
is_sparse=False)
h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32")
h_0 = to_variable(h_0)
        self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3)
        self._fc2 = Linear(input_dim=self.hid_dim * 2, output_dim=self.fc_hid_dim, act="tanh")
self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
output_dim=self.class_dim,
act="softmax")
self._encoder = BiGRUEncoder(
grnn_hidden_dim=self.hid_dim,
input_dim=self.hid_dim * 3,
h_0=h_0,
init_bound=0.1,
is_bidirection=True)
def forward(self, inputs):
emb = self.embedding(inputs)
emb = fluid.layers.reshape(emb, shape=[self.batch_size, -1, self.hid_dim])
fc_1 = self._fc1(emb)
encoded_vector = self._encoder(fc_1)
encoded_vector = fluid.layers.tanh(encoded_vector)
encoded_vector = fluid.layers.reduce_max(encoded_vector, dim=1)
fc_2 = self._fc2(encoded_vector)
prediction = self._fc_prediction(fc_2)
return prediction
checkpoints: "./checkpoints"
epoch: 5
save_freq: 1
eval_freq: 1
lr: 0.002
padding_size: 150
skip_steps: 10
verbose: False
data_dir: "./senta_data/"
vocab_path: "./senta_data/word_dict.txt"
vocab_size: 33256
batch_size: 20
random_seed: 0
use_cuda: True
do_train: True
do_infer: False
model_type: "bow_net"
output_dir: "./output"
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Sentiment Classification in Paddle Dygraph Mode. """
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
from hapi.model import set_device, Model, CrossEntropy, Input
from hapi.configure import Config
from hapi.text.senta import SentaProcessor
from hapi.metrics import Accuracy
from models import CNN, BOW, GRU, BiGRU
import json
import os
args = Config(yaml_file='./senta.yaml')
args.build()
args.Print()
device = set_device("gpu" if args.use_cuda else "cpu")
dev_count = fluid.core.get_cuda_device_count() if args.use_cuda else 1
def main():
if args.do_train:
train()
elif args.do_infer:
infer()
def train():
fluid.enable_dygraph(device)
processor = SentaProcessor(
data_dir=args.data_dir,
vocab_path=args.vocab_path,
random_seed=args.random_seed)
num_labels = len(processor.get_labels())
num_train_examples = processor.get_num_examples(phase="train")
max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count
train_data_generator = processor.data_generator(
batch_size=args.batch_size,
padding_size=args.padding_size,
places=device,
phase='train',
epoch=args.epoch,
shuffle=False)
eval_data_generator = processor.data_generator(
batch_size=args.batch_size,
padding_size=args.padding_size,
places=device,
phase='dev',
epoch=args.epoch,
shuffle=False)
    if args.model_type == 'cnn_net':
        model = CNN(args.vocab_size, args.batch_size, args.padding_size)
    elif args.model_type == 'bow_net':
        model = BOW(args.vocab_size, args.batch_size, args.padding_size)
    elif args.model_type == 'gru_net':
        model = GRU(args.vocab_size, args.batch_size, args.padding_size)
    elif args.model_type == 'bigru_net':
        model = BiGRU(args.vocab_size, args.batch_size, args.padding_size)
optimizer = fluid.optimizer.Adagrad(learning_rate=args.lr, parameter_list=model.parameters())
inputs = [Input([None, None], 'int64', name='doc')]
labels = [Input([None, 1], 'int64', name='label')]
model.prepare(
optimizer,
CrossEntropy(),
Accuracy(topk=(1,)),
inputs,
labels,
device=device)
model.fit(train_data=train_data_generator,
eval_data=eval_data_generator,
batch_size=args.batch_size,
epochs=args.epoch,
save_dir=args.checkpoints,
eval_freq=args.eval_freq,
save_freq=args.save_freq)
def infer():
fluid.enable_dygraph(device)
processor = SentaProcessor(
data_dir=args.data_dir,
vocab_path=args.vocab_path,
random_seed=args.random_seed)
infer_data_generator = processor.data_generator(
batch_size=args.batch_size,
padding_size=args.padding_size,
places=device,
phase='infer',
epoch=1,
shuffle=False)
    if args.model_type == 'cnn_net':
        model_infer = CNN(args.vocab_size, args.batch_size, args.padding_size)
    elif args.model_type == 'bow_net':
        model_infer = BOW(args.vocab_size, args.batch_size, args.padding_size)
    elif args.model_type == 'gru_net':
        model_infer = GRU(args.vocab_size, args.batch_size, args.padding_size)
    elif args.model_type == 'bigru_net':
        model_infer = BiGRU(args.vocab_size, args.batch_size, args.padding_size)
print('Do inferring ...... ')
inputs = [Input([None, None], 'int64', name='doc')]
model_infer.prepare(
None,
CrossEntropy(),
Accuracy(topk=(1,)),
inputs,
device=device)
model_infer.load(args.checkpoints, reset_optimizer=True)
preds = model_infer.predict(test_data=infer_data_generator)
preds = np.array(preds[0]).reshape((-1, 2))
if args.output_dir:
with open(os.path.join(args.output_dir, 'predictions.json'), 'w') as w:
for p in range(len(preds)):
                label = int(np.argmax(preds[p]))  # cast to built-in int so json can serialize it
                result = json.dumps({'index': p, 'label': label, 'probs': preds[p].tolist()})
w.write(result+'\n')
print('Predictions saved at '+os.path.join(args.output_dir, 'predictions.json'))
if __name__ == '__main__':
main()
@@ -39,8 +39,8 @@ TSM is a video classification model built by inserting the Temporal Shift Module into ResNet
 ```bash
 git clone https://github.com/PaddlePaddle/hapi
 cd hapi
-export PYTHONPATH=$PYTHONPATH:`pwd`
-cd tsm
+export PYTHONPATH=`pwd`:$PYTHONPATH
+cd examples/tsm
 ```

 ### Data preparation
@@ -141,6 +141,8 @@ python infer.py --data=<path/to/dataset> --label_list=<path/to/label_list> --inf
 2020-04-03 07:37:16,321-INFO: Sample ./kineteics/val_10/data_batch_10-042_6 predict label: 6, ground truth label: 6
 ```

+**Note:** for inference, `--infer_file` must point to a pickle file path.
+
 ## References

 - [Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/abs/1811.08383v1), Ji Lin, Chuang Gan, Song Han
...
@@ -26,6 +26,7 @@ from check import check_gpu, check_version
 from modeling import tsm_resnet50
 from kinetics_dataset import KineticsDataset
 from transforms import *
+from utils import print_arguments

 import logging
 logger = logging.getLogger(__name__)
@@ -56,7 +57,7 @@ def main():
     model.load(FLAGS.weights, reset_optimizer=True)

     imgs, label = dataset[0]
-    pred = model.test([imgs[np.newaxis, :]])
+    pred = model.test_batch([imgs[np.newaxis, :]])
     pred = labels[np.argmax(pred)]
     logger.info("Sample {} predict label: {}, ground truth label: {}" \
                 .format(FLAGS.infer_file, pred, labels[int(label)]))
@@ -86,6 +87,7 @@ if __name__ == '__main__':
         type=str,
         help="weights path for evaluation")
     FLAGS = parser.parse_args()
+    print_arguments(FLAGS)
     check_gpu(str.lower(FLAGS.device) == 'gpu')
     check_version()
...
@@ -113,7 +113,7 @@ class KineticsDataset(Dataset):
         if self.transform:
             imgs, label = self.transform(imgs, label)

-        return imgs, np.array([label])
+        return imgs, np.array([label]).astype('int64')

     @property
     def num_classes(self):
...
@@ -31,6 +31,7 @@ from modeling import tsm_resnet50
 from check import check_gpu, check_version
 from kinetics_dataset import KineticsDataset
 from transforms import *
+from utils import print_arguments

 def make_optimizer(step_per_epoch, parameter_list=None):
@@ -106,7 +107,7 @@ def main():
               eval_data=val_dataset,
               epochs=FLAGS.epoch,
               batch_size=FLAGS.batch_size,
-              save_dir='tsm_checkpoint',
+              save_dir=FLAGS.save_dir or 'tsm_checkpoint',
               num_workers=FLAGS.num_workers,
               drop_last=True,
               shuffle=True)
@@ -150,7 +151,14 @@ if __name__ == '__main__':
         default=None,
         type=str,
         help="weights path for evaluation")
+    parser.add_argument(
+        "-s",
+        "--save_dir",
+        default=None,
+        type=str,
+        help="directory path for checkpoint saving, default ./tsm_checkpoint")
     FLAGS = parser.parse_args()
+    print_arguments(FLAGS)
     check_gpu(str.lower(FLAGS.device) == 'gpu')
     check_version()
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import logging
logger = logging.getLogger(__name__)
__all__ = ['print_arguments']
def print_arguments(args):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
logger.info("----------- Configuration Arguments -----------")
for arg, value in sorted(six.iteritems(vars(args))):
logger.info("%s: %s" % (arg, value))
logger.info("------------------------------------------------")
@@ -53,8 +53,8 @@ The YOLOv3 network consists of a backbone feature-extraction network, multi-scale feature fusion layers
 ```bash
 git clone https://github.com/PaddlePaddle/hapi
 cd hapi
-export PYTHONPATH=$PYTHONPATH:`pwd`
-cd tsm
+export PYTHONPATH=`pwd`:$PYTHONPATH
+cd examples/yolov3
 ```

 #### Install COCO-API
@@ -126,13 +126,13 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch main.py --data=
 Launch multi-card training as follows:

 ```bash
-CUDA_VISIBLE_DEVICES=0,1,2,3 python main.py -m paddle.distributed.launch --data=<path/to/dataset> --batch_size=16 -d
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch main.py --data=<path/to/dataset> --batch_size=16 -d
 ```

 ### Model evaluation

-The YOLOv3 model outputs LoDTensor, so evaluation only supports batch_size=1; it can be done in either of the two ways below.
+The YOLOv3 model outputs LoDTensor, so evaluation only supports a single card with batch_size=1; it can be done in either of the two ways below.

 1. Evaluate with the Paddle-released [YOLOv3-DarkNet53](https://paddlemodels.bj.bcebos.com/hapi/yolov3_darknet53.pdparams) weights (downloaded automatically)
@@ -180,7 +180,7 @@ python infer.py --label_list=dataset/voc/label_list.txt --infer_image=image/dog.
 2. Load a checkpoint for accuracy evaluation

 ```bash
-python infer.py --label_list=dataset/voc/label_list.txt --infer_image=image/dog.jpg --weights=yolo_checkpoint/mo_mixup/final
+python infer.py --label_list=dataset/voc/label_list.txt --infer_image=image/dog.jpg --weights=yolo_checkpoint/no_mixup/final
 ```

 Visualized inference results are saved under the folder given by `--output`, `./output` by default.
...
@@ -28,7 +28,7 @@ from hapi.model import Model, Input, set_device
 from modeling import yolov3_darknet53, YoloLoss
 from transforms import *
+from utils import print_arguments
 from visualizer import draw_bbox

 import logging
@@ -91,7 +91,7 @@ def main():
     img_id = np.array([0]).astype('int64')[np.newaxis, :]
     img_shape = np.array([h, w]).astype('int32')[np.newaxis, :]

-    _, bboxes = model.test([img_id, img_shape, img])
+    _, bboxes = model.test_batch([img_id, img_shape, img])

     vis_img = draw_bbox(orig_img, cat2name, bboxes, FLAGS.draw_threshold)
     save_name = get_save_image_name(FLAGS.output_dir, FLAGS.infer_image)
@@ -121,6 +121,7 @@ if __name__ == '__main__':
         "-w", "--weights", default=None, type=str,
         help="path to weights for inference")
     FLAGS = parser.parse_args()
+    print_arguments(FLAGS)
     assert os.path.isfile(FLAGS.infer_image), \
         "infer_image {} not a file".format(FLAGS.infer_image)
     assert os.path.isfile(FLAGS.label_list), \
...
@@ -33,6 +33,7 @@ from modeling import yolov3_darknet53, YoloLoss
 from coco import COCODataset
 from coco_metric import COCOMetric
 from transforms import *
+from utils import print_arguments

 NUM_MAX_BOXES = 50
@@ -171,16 +172,18 @@ def main():
     if FLAGS.resume is not None:
         model.load(FLAGS.resume)

+    save_dir = FLAGS.save_dir or 'yolo_checkpoint'
     model.fit(train_data=loader,
               epochs=FLAGS.epoch - FLAGS.no_mixup_epoch,
-              save_dir="yolo_checkpoint/mixup",
+              save_dir=os.path.join(save_dir, "mixup"),
               save_freq=10)

     # do not use image mixup transform in the last FLAGS.no_mixup_epoch epochs
     dataset.mixup = False
     model.fit(train_data=loader,
               epochs=FLAGS.no_mixup_epoch,
-              save_dir="yolo_checkpoint/no_mixup",
+              save_dir=os.path.join(save_dir, "no_mixup"),
               save_freq=5)
@@ -233,6 +236,13 @@ if __name__ == '__main__':
         default=None,
         type=str,
         help="path to weights for evaluation")
+    parser.add_argument(
+        "-s",
+        "--save_dir",
+        default=None,
+        type=str,
+        help="directory path for checkpoint saving, default ./yolo_checkpoint")
     FLAGS = parser.parse_args()
+    print_arguments(FLAGS)
     assert FLAGS.data, "error: must provide data path"
     main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import logging
logger = logging.getLogger(__name__)
__all__ = ['print_arguments']
def print_arguments(args):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
logger.info("----------- Configuration Arguments -----------")
for arg, value in sorted(six.iteritems(vars(args))):
logger.info("%s: %s" % (arg, value))
logger.info("------------------------------------------------")
@@ -123,7 +123,7 @@ class Flowers(Dataset):
         if self.transform is not None:
             image = self.transform(image)

-        return image, label
+        return image, label.astype('int64')

     def __len__(self):
         return len(self.indexes)
@@ -45,6 +45,8 @@ class MNIST(Dataset):
             :attr:`download` is True. Default None
         label_path(str): path to label file, can be set None if
             :attr:`download` is True. Default None
+        chw_format(bool): If set True, the output shape is [1, 28, 28],
+            otherwise, output shape is [1, 784].
         mode(str): 'train' or 'test' mode. Default 'train'.
         download(bool): whether auto download mnist dataset if
             :attr:`image_path`/:attr:`label_path` unset. Default
@@ -70,13 +72,14 @@ class MNIST(Dataset):
     def __init__(self,
                  image_path=None,
                  label_path=None,
+                 chw_format=True,
                  mode='train',
                  transform=None,
                  download=True):
         assert mode.lower() in ['train', 'test'], \
             "mode should be 'train' or 'test', but got {}".format(mode)
         self.mode = mode.lower()
+        self.chw_format = chw_format
         self.image_path = image_path
         if self.image_path is None:
             assert download, "image_path not set and auto download disabled"
@@ -144,10 +147,13 @@ class MNIST(Dataset):
             for i in range(buffer_size):
                 self.images.append(images[i, :])
-                self.labels.append(np.array([labels[i]]))
+                self.labels.append(
+                    np.array([labels[i]]).astype('int64'))

     def __getitem__(self, idx):
         image, label = self.images[idx], self.labels[idx]
+        if self.chw_format:
+            image = np.reshape(image, [1, 28, 28])
         if self.transform is not None:
             image = self.transform(image)
         return image, label
...
@@ -23,6 +23,7 @@ import requests
 import tqdm
 import hashlib
 import time
+from collections import OrderedDict

 from paddle.fluid.dygraph.parallel import ParallelEnv
@@ -35,6 +36,26 @@ WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/hapi/weights")
 DOWNLOAD_RETRY_LIMIT = 3

+nlp_models = OrderedDict(
+    (('RoBERTa-zh-base', 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_ext_L-12_H-768_A-12.tar.gz'),
+     ('RoBERTa-zh-large', 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.tar.gz'),
+     ('ERNIE-v2-en-base', 'https://ernie.bj.bcebos.com/ERNIE_Base_en_stable-2.0.0.tar.gz'),
+     ('ERNIE-v2-en-large', 'https://ernie.bj.bcebos.com/ERNIE_Large_en_stable-2.0.0.tar.gz'),
+     ('XLNet-cased-base', 'https://xlnet.bj.bcebos.com/xlnet_cased_L-12_H-768_A-12.tgz'),
+     ('XLNet-cased-large', 'https://xlnet.bj.bcebos.com/xlnet_cased_L-24_H-1024_A-16.tgz'),
+     ('ERNIE-v1-zh-base', 'https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz'),
+     ('ERNIE-v1-zh-base-max-len-512', 'https://ernie.bj.bcebos.com/ERNIE_1.0_max-len-512.tar.gz'),
+     ('BERT-en-uncased-large-whole-word-masking', 'https://bert-models.bj.bcebos.com/wwm_uncased_L-24_H-1024_A-16.tar.gz'),
+     ('BERT-en-cased-large-whole-word-masking', 'https://bert-models.bj.bcebos.com/wwm_cased_L-24_H-1024_A-16.tar.gz'),
+     ('BERT-en-uncased-base', 'https://bert-models.bj.bcebos.com/uncased_L-12_H-768_A-12.tar.gz'),
+     ('BERT-en-uncased-large', 'https://bert-models.bj.bcebos.com/uncased_L-24_H-1024_A-16.tar.gz'),
+     ('BERT-en-cased-base', 'https://bert-models.bj.bcebos.com/cased_L-12_H-768_A-12.tar.gz'),
+     ('BERT-en-cased-large', 'https://bert-models.bj.bcebos.com/cased_L-24_H-1024_A-16.tar.gz'),
+     ('BERT-multilingual-uncased-base', 'https://bert-models.bj.bcebos.com/multilingual_L-12_H-768_A-12.tar.gz'),
+     ('BERT-multilingual-cased-base', 'https://bert-models.bj.bcebos.com/multi_cased_L-12_H-768_A-12.tar.gz'),
+     ('BERT-zh-base', 'https://bert-models.bj.bcebos.com/chinese_L-12_H-768_A-12.tar.gz'),)
+)

 def is_url(path):
     """
...
@@ -116,7 +116,7 @@ class Accuracy(Metric):
     def add_metric_op(self, pred, label, *args):
         pred = fluid.layers.argsort(pred, descending=True)[1][:, :self.maxk]
         correct = pred == label
-        return correct
+        return fluid.layers.cast(correct, dtype='float32')

     def update(self, correct, *args):
         accs = []
@@ -143,7 +143,7 @@ class Accuracy(Metric):
         if self.maxk != 1:
             self._name = ['{}_top{}'.format(name, k) for k in self.topk]
         else:
-            self._name = ['acc']
+            self._name = [name]

     def name(self):
         return self._name
@@ -642,7 +642,7 @@ class Model(fluid.dygraph.Layer):
     A Model object is a network with training and inference features.
     Dynamic graph and static graph are supported at the same time,
     switched by `fluid.enable_dygraph()`. The usage is as follows.
-    The switching between dynamic and static should be before
+    But note, the switching between dynamic and static should be before
     instantiating a Model. The input description, i.e., hapi.Input,
     is required for static graph.
@@ -993,8 +993,25 @@ class Model(fluid.dygraph.Layer):
         Returns a list of parameters of the model.

         Returns:
-            list of :ref:`api_guide_Variable_en` : a list of parameters.
+            list of Parameter in static graph.
+            list of ParamBase in dynamic graph.
+
+        Examples:
+            .. code-block:: python
+
+                from hapi.model import Model, Input, set_device
+
+                class MyModel(Model):
+                    def __init__(self):
+                        super(MyModel, self).__init__()
+                        self._fc = fluid.dygraph.Linear(20, 10, act='softmax')
+                    def forward(self, x):
+                        y = self._fc(x)
+                        return y
+
+                fluid.enable_dygraph()
+                model = MyModel()
+                params = model.parameters()
         """
         return self._adapter.parameters()
@@ -1006,27 +1023,32 @@ class Model(fluid.dygraph.Layer):
                 labels=None,
                 device=None):
         """
-        FIXME: add comments
+        Configures the model before running.

         Args:
-            optimizer (Optimizer|None): optimizer must be set in training
+            optimizer (Optimizer|None): Optimizer must be set in training
                 and should be an Optimizer instance. It can be None in eval
                 and test mode.
-            loss_function (Loss|None): loss function must be set in training
+            loss_function (Loss|None): Loss function must be set in training
                 and should be a Loss instance. It can be None when there is
                 no loss.
-            metrics (Metric|list of Metric|None): if metrics is set, all
-                metric will be calculate and output in train/eval mode.
+            metrics (Metric|list of Metric|None): If metrics is set, all
+                metrics will be calculated and output in train/eval mode.
             inputs (Input|list|dict|None): inputs, entry points of network,
                 could be an Input layer, or lists of Input layers,
                 or dict (name: Input), or None. For static graph,
                 inputs must be set. For dynamic graph, it could be None.
             labels (Input|list|None): labels, entry points of network,
                 could be an Input layer or lists of Input layers, or None.
-                For static graph, if set loss_function in Model.prepare(), it
-                must be set. Otherwise, it could be None.
+                For static graph, if labels is required in loss_function,
+                labels must be set. Otherwise, it could be None.
-            device (str|None): specify device type, 'CPU' or 'GPU'.
+            device (str|fluid.CUDAPlace|fluid.CPUPlace|None): specify device
+                type, 'CPU', 'GPU', fluid.CUDAPlace or fluid.CPUPlace.
                 If None, automatically select device according to
                 installation package version.
+
+        Returns:
+            None
         """

         if isinstance(device, fluid.CUDAPlace) or \
@@ -1108,7 +1130,9 @@ class Model(fluid.dygraph.Layer):
                num_workers=0,
                callbacks=None, ):
         """
-        FIXME: add more comments and usage
+        Trains the model for a fixed number of epochs. If `eval_data` is set,
+        evaluation will be done at the end of each epoch.

         Args:
             train_data (Dataset|DataLoader): An iterable data loader is used for
                 train. An instance of paddle paddle.io.Dataset or
@@ -1141,6 +1165,87 @@ class Model(fluid.dygraph.Layer):
             callbacks (Callback|None): A list of `Callback` instances to apply
                 during training. If None, `ProgBarLogger` and `ModelCheckpoint`
                 are automatically inserted. Default: None.
+
+        Returns:
+            None
+
+        Examples:
+            1. An example that uses a Dataset and sets batch size and shuffle
+               in fit. Batching is done internally.
+
+            .. code-block:: python
+
+                from hapi.model import Model, Input, set_device
+                from hapi.loss import CrossEntropy
+                from hapi.metrics import Accuracy
+                from hapi.datasets import MNIST
+                from hapi.vision.models import LeNet
+
+                dynamic = True
+                device = set_device(FLAGS.device)
+                fluid.enable_dygraph(device) if dynamic else None
+
+                train_dataset = MNIST(mode='train')
+                val_dataset = MNIST(mode='test')
+
+                inputs = [Input([None, 1, 28, 28], 'float32', name='image')]
+                labels = [Input([None, 1], 'int64', name='label')]
+
+                model = LeNet()
+                optim = fluid.optimizer.Adam(
+                    learning_rate=0.001, parameter_list=model.parameters())
+                model.prepare(
+                    optim,
+                    CrossEntropy(),
+                    Accuracy(topk=(1, 2)),
+                    inputs=inputs,
+                    labels=labels,
+                    device=device)
+                model.fit(train_dataset,
+                          val_dataset,
+                          epochs=2,
+                          batch_size=64,
+                          save_dir='mnist_checkpoint')
+
+            2. An example that uses a DataLoader; batch size and shuffle are
+               set in the DataLoader.
+
+            .. code-block:: python
+
+                from hapi.model import Model, Input, set_device
+                from hapi.loss import CrossEntropy
+                from hapi.metrics import Accuracy
+                from hapi.datasets import MNIST
+                from hapi.vision.models import LeNet
+
+                dynamic = True
+                device = set_device(FLAGS.device)
+                fluid.enable_dygraph(device) if dynamic else None
+
+                train_dataset = MNIST(mode='train')
+                train_loader = fluid.io.DataLoader(train_dataset,
+                    places=device, batch_size=64)
+                val_dataset = MNIST(mode='test')
+                val_loader = fluid.io.DataLoader(val_dataset,
+                    places=device, batch_size=64)
+
+                inputs = [Input([None, 1, 28, 28], 'float32', name='image')]
+                labels = [Input([None, 1], 'int64', name='label')]
+
+                model = LeNet()
+                optim = fluid.optimizer.Adam(
+                    learning_rate=0.001, parameter_list=model.parameters())
+                model.prepare(
+                    optim,
+                    CrossEntropy(),
+                    Accuracy(topk=(1, 2)),
+                    inputs=inputs,
+                    labels=labels,
+                    device=device)
+                model.fit(train_loader,
+                          val_loader,
+                          epochs=2,
+                          save_dir='mnist_checkpoint')
         """

         assert train_data is not None, \
@@ -1235,26 +1340,29 @@ class Model(fluid.dygraph.Layer):
                  num_workers=0,
                  callbacks=None, ):
         """
-        FIXME: add more comments and usage
+        Evaluate the loss and metrics of the model on input dataset.

         Args:
             eval_data (Dataset|DataLoader): An iterable data loader is used for
                 evaluation. An instance of paddle.io.Dataset or
                 paddle.io.Dataloader is recommended.
-            batch_size (int): Integer number. The batch size of train_data and eval_data.
-                When eval_data is the instance of Dataloader, this argument will be ignored.
-                Default: 1.
+            batch_size (int): Integer number. The batch size of train_data
+                and eval_data. When eval_data is the instance of Dataloader,
+                this argument will be ignored. Default: 1.
             log_freq (int): The frequency, in number of steps, the eval logs
                 are printed. Default: 10.
-            verbose (int): The verbosity mode, should be 0, 1, or 2.
-                0 = silent, 1 = progress bar, 2 = one line per epoch. Default: 2.
+            verbose (int): The verbosity mode, should be 0, 1, or 2. 0 = silent,
+                1 = progress bar, 2 = one line per epoch. Default: 2.
-            num_workers (int): The number of subprocess to load data, 0 for no subprocess
-                used and loading data in main process. When train_data and eval_data are
-                both the instance of Dataloader, this parameter will be ignored. Default: 0.
+            num_workers (int): The number of subprocesses to load data,
+                0 for no subprocess used and loading data in main process. When
+                train_data and eval_data are both the instance of Dataloader,
+                this parameter will be ignored. Default: 0.
             callbacks (Callback|None): A list of `Callback` instances to apply
                 during training. If None, `ProgBarLogger` and `ModelCheckpoint`
                 are automatically inserted. Default: None.

         Returns:
-            dict: Result of metric.
+            dict: Result of metric. The key is the name of the Metric and
+                the value is a scalar or numpy.array.
         """

         if fluid.in_dygraph_mode():
@@ -1312,7 +1420,8 @@ class Model(fluid.dygraph.Layer):
                 num_workers=0,
                 stack_outputs=False):
         """
-        FIXME: add more comments and usage
+        Compute the output predictions on testing data.

         Args:
             test_data (Dataset|DataLoader): An iterable data loader is used for
                 predict. An instance of paddle.io.Dataset or paddle.io.Dataloader
@@ -1387,21 +1496,20 @@ class Model(fluid.dygraph.Layer):
                              save_dir,
                              model_filename=None,
                              params_filename=None,
-                             program_only=False):
+                             model_only=False):
         """
         Save inference model (must be called in static graph mode).

         Args:
             dirname(str): The directory path to save the inference model.
-            model_filename(str|None): The name of file to save the inference program
-                itself. If is set None, a default filename
+            model_filename(str|None): The name of file to save the inference
+                model itself. If it is set None, a default filename
                 :code:`__model__` will be used.
-            params_filename(str|None): The name of file to save all related parameters.
-                If it is set None, parameters will be saved
+            params_filename(str|None): The name of file to save all related
+                parameters. If it is set None, parameters will be saved
                 in separate files.
-            program_only(bool): If True, It will save inference program only, and do not
-                save params of Program.
-                Default: False.
+            model_only(bool): If True, it will save the inference model only,
+                and will not save parameters. Default: False.

         Returns:
             list: The fetch variables' name list
@@ -1426,7 +1534,7 @@ class Model(fluid.dygraph.Layer):
             main_program=infer_prog,
             model_filename=model_filename,
             params_filename=params_filename,
-            program_only=program_only)
+            program_only=model_only)

     def _run_one_epoch(self,
                        data_loader,
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import print_function
import os
import unittest
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from hapi.metrics import *
from hapi.utils import to_list
def accuracy(pred, label, topk=(1, )):
maxk = max(topk)
pred = np.argsort(pred)[:, ::-1][:, :maxk]
correct = (pred == np.repeat(label, maxk, 1))
batch_size = label.shape[0]
res = []
for k in topk:
correct_k = correct[:, :k].sum()
res.append(correct_k / batch_size)
return res
def convert_to_one_hot(y, C):
oh = np.random.random((y.shape[0], C)).astype('float32') * .5
for i in range(y.shape[0]):
oh[i, int(y[i])] = 1.
return oh
class TestAccuracyDynamic(unittest.TestCase):
def setUp(self):
self.topk = (1, )
self.class_num = 5
self.sample_num = 1000
self.name = None
def random_pred_label(self):
label = np.random.randint(0, self.class_num, (self.sample_num, 1)).astype('int64')
pred = np.random.randint(0, self.class_num, (self.sample_num, 1)).astype('int32')
pred_one_hot = convert_to_one_hot(pred, self.class_num)
pred_one_hot = pred_one_hot.astype('float32')
return label, pred_one_hot
def test_main(self):
with fluid.dygraph.guard(fluid.CPUPlace()):
acc = Accuracy(topk=self.topk, name=self.name)
for i in range(10):
label, pred = self.random_pred_label()
label_var = to_variable(label)
pred_var = to_variable(pred)
state = to_list(acc.add_metric_op(pred_var, label_var))
acc.update(*[s.numpy() for s in state])
res_m = acc.accumulate()
res_f = accuracy(pred, label, self.topk)
assert np.all(np.isclose(np.array(res_m), np.array(res_f), rtol=1e-3)), \
"Accuracy precision error: {} != {}".format(res_m, res_f)
acc.reset()
assert np.sum(acc.total) == 0
assert np.sum(acc.count) == 0
class TestAccuracyDynamicMultiTopk(TestAccuracyDynamic):
def setUp(self):
self.topk = (1, 5)
self.class_num = 10
self.sample_num = 1000
self.name = "accuracy"
class TestAccuracyStatic(TestAccuracyDynamic):
def test_main(self):
main_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
pred = fluid.data(name='pred', shape=[None, self.class_num], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
acc = Accuracy(topk=self.topk, name=self.name)
state = acc.add_metric_op(pred, label)
exe = fluid.Executor(fluid.CPUPlace())
compiled_main_prog = fluid.CompiledProgram(main_prog)
for i in range(10):
label, pred = self.random_pred_label()
state_ret = exe.run(compiled_main_prog,
feed={'pred': pred, 'label': label},
fetch_list=[s.name for s in to_list(state)],
return_numpy=True)
acc.update(*state_ret)
res_m = acc.accumulate()
res_f = accuracy(pred, label, self.topk)
assert np.all(np.isclose(np.array(res_m), np.array(res_f), rtol=1e-3)), \
"Accuracy precision error: {} != {}".format(res_m, res_f)
acc.reset()
assert np.sum(acc.total) == 0
assert np.sum(acc.count) == 0
class TestAccuracyStaticMultiTopk(TestAccuracyStatic):
def setUp(self):
self.topk = (1, 5)
self.class_num = 10
self.sample_num = 1000
self.name = "accuracy"
if __name__ == '__main__':
unittest.main()
@@ -17,6 +17,7 @@ from __future__ import print_function

 import unittest
+import os

 import numpy as np
 import shutil
 import tempfile
@@ -72,7 +73,8 @@ class MnistDataset(MNIST):
         self.labels = self.labels[:sample_num]

     def __getitem__(self, idx):
-        img = np.reshape(self.images[idx], [1, 28, 28])
+        img, label = self.images[idx], self.labels[idx]
+        img = np.reshape(img, [1, 28, 28])
         if self.return_label:
             return img, np.array(self.labels[idx]).astype('int64')
         return img,
@@ -141,34 +143,61 @@ class TestModel(unittest.TestCase):
         cls.init_param = dy_lenet.state_dict()
         dynamic_train(dy_lenet, cls.train_loader)

-        cls.trained_param = dy_lenet.state_dict()
         cls.acc1 = dynamic_evaluate(dy_lenet, cls.val_loader)

         cls.inputs = [Input([-1, 1, 28, 28], 'float32', name='image')]
         cls.labels = [Input([None, 1], 'int64', name='label')]

+        cls.save_dir = tempfile.mkdtemp()
+        cls.weight_path = os.path.join(cls.save_dir, 'lenet')
+        fluid.dygraph.save_dygraph(dy_lenet.state_dict(), cls.weight_path)
+
         fluid.disable_dygraph()

+    @classmethod
+    def tearDownClass(cls):
+        shutil.rmtree(cls.save_dir)
+
     def test_fit_dygraph(self):
         self.fit(True)

     def test_fit_static(self):
         self.fit(False)

-    def not_test_evaluate_dygraph(self):
+    def test_evaluate_dygraph(self):
         self.evaluate(True)

-    def not_test_evaluate_static(self):
+    def test_evaluate_static(self):
         self.evaluate(False)

-    def not_test_predict_dygraph(self):
+    def test_predict_dygraph(self):
         self.predict(True)

-    def not_test_predict_static(self):
+    def test_predict_static(self):
         self.predict(False)

+    def predict(self, dynamic):
+        fluid.enable_dygraph(self.device) if dynamic else None
+        inputs = [Input([-1, 1, 28, 28], 'float32', name='image')]
+        labels = [Input([None, 1], 'int64', name='label')]
+        test_dataloader = fluid.io.DataLoader(
+            self.test_dataset,
+            places=self.device,
+            batch_size=64,
+            return_list=True)
+        model = LeNet()
+        model.load(self.weight_path)
+        model.prepare(metrics=Accuracy(), inputs=inputs, labels=labels)
+        output = model.predict(test_dataloader, stack_outputs=True)
+
     def fit(self, dynamic):
         fluid.enable_dygraph(self.device) if dynamic else None
         seed = 333
         fluid.default_startup_program().random_seed = seed
         fluid.default_main_program().random_seed = seed
@@ -193,7 +222,7 @@ class TestModel(unittest.TestCase):
         model = LeNet()
         model.prepare(
             metrics=Accuracy(), inputs=self.inputs, labels=self.labels)
-        model.load_dict(self.trained_param)
+        model.load(self.weight_path)
         result = model.evaluate(self.val_dataset, batch_size=64)
         np.testing.assert_allclose(result['acc'], self.acc1)
         fluid.disable_dygraph() if dynamic else None
...@@ -202,7 +231,7 @@ class TestModel(unittest.TestCase): ...@@ -202,7 +231,7 @@ class TestModel(unittest.TestCase):
fluid.enable_dygraph(self.device) if dynamic else None fluid.enable_dygraph(self.device) if dynamic else None
model = LeNet() model = LeNet()
model.prepare(inputs=self.inputs) model.prepare(inputs=self.inputs)
model.load_dict(self.trained_param) model.load(self.weight_path)
output = model.predict( output = model.predict(
self.test_dataset, batch_size=64, stack_outputs=True) self.test_dataset, batch_size=64, stack_outputs=True)
np.testing.assert_equal(output[0].shape[0], len(self.test_dataset)) np.testing.assert_equal(output[0].shape[0], len(self.test_dataset))
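Judging from the shape assertion above, stack_outputs=True appears to concatenate the per-batch prediction arrays along the batch axis, so each output array ends up with one row per test sample. A minimal NumPy sketch of that behavior (illustrative only, not the hapi implementation):

import numpy as np

# hypothetical outputs from three predict batches (two full, one partial)
batch_outputs = [np.ones((64, 10)), np.ones((64, 10)), np.ones((32, 10))]

# stacking as model.predict(..., stack_outputs=True) appears to do
stacked = np.concatenate(batch_outputs, axis=0)
assert stacked.shape[0] == 64 + 64 + 32  # one row per test sample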
@@ -269,11 +298,10 @@ class TestModelFunction(unittest.TestCase):
                 device=device)
             loss, = model.train_batch([data], [label])
-            print(loss, ref)
             np.testing.assert_allclose(loss.flatten(), ref.flatten())
             fluid.disable_dygraph() if dynamic else None

-    def not_test_test_batch(self, dynamic=True):
+    def test_test_batch(self, dynamic=True):
         dim = 20
         data = np.random.random(size=(4, dim)).astype(np.float32)
@@ -288,9 +316,9 @@ class TestModelFunction(unittest.TestCase):
         ref = get_expect()
         for dynamic in [True, False]:
-            self.set_seed()
             device = set_device('cpu')
             fluid.enable_dygraph(device) if dynamic else None
+            self.set_seed()
             model = MyModel()
             inputs = [Input([None, dim], 'float32', name='x')]
             model.prepare(inputs=inputs, device=device)
@@ -299,24 +327,29 @@ class TestModelFunction(unittest.TestCase):
             np.testing.assert_allclose(out, ref)
             fluid.disable_dygraph() if dynamic else None

-    def not_test_save_load(self):
+    def test_save_load(self):
         path = tempfile.mkdtemp()
         for dynamic in [True, False]:
             device = set_device('cpu')
             fluid.enable_dygraph(device) if dynamic else None
             model = MyModel()
+            inputs = [Input([None, 20], 'float32', name='x')]
+            model.prepare(inputs=inputs)
             model.save(path + '/test')
             model.load(path + '/test')
             shutil.rmtree(path)
             fluid.disable_dygraph() if dynamic else None

-    def not_test_parameters(self):
+    def test_parameters(self):
         for dynamic in [True, False]:
             device = set_device('cpu')
             fluid.enable_dygraph(device) if dynamic else None
             model = MyModel()
+            inputs = [Input([None, 20], 'float32', name='x')]
+            model.prepare(inputs=inputs)
             params = model.parameters()
-            self.assertTrue(params[0].shape == [20, 10])
+            self.assertTrue(params[0].shape[0] == 20)
+            self.assertTrue(params[0].shape[1] == 10)
             fluid.disable_dygraph() if dynamic else None
......
@@ -13,7 +13,9 @@
 # limitations under the License.

 from hapi.text.bert.bert import BertConfig as BertConfig
-from hapi.text.bert.optimization import Optimizer as Optimizer
+from hapi.text.bert.dygraph_optimization import DyOptimizer as DyOptimizer
+from hapi.text.bert.static_optimization import StOptimizer as StOptimizer
+from hapi.text.bert.optimization import make_optimizer as make_optimizer
 from hapi.text.bert.dataloader import BertDataLoader as BertDataLoader
 from hapi.text.bert.dataloader import BertInputExample as BertInputExample
 from hapi.text.tokenizer import tokenization as tokenization
......
@@ -23,8 +23,8 @@ import numpy as np
 import paddle
 import paddle.fluid as fluid

+from hapi.model import Model
 from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, to_variable, Layer, guard
 from hapi.text.text import PrePostProcessLayer, TransformerEncoder
 from hapi.text.bert.utils.init import init_from_static_model
@@ -52,7 +52,7 @@ class BertConfig(object):
         print('------------------------------------------------')


-class BertEncoder(Layer):
+class BertEncoder(Model):
     """
     bert
     """
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimization and learning rate scheduling."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
class ConstantLR(LearningRateDecay):
    def __init__(self, learning_rate, begin=0, step=1, dtype='float32'):
        super(ConstantLR, self).__init__(begin, step, dtype)
        self.learning_rate = learning_rate

    def step(self):
        return self.learning_rate


class LinearDecay(LearningRateDecay):
    def __init__(self,
                 learning_rate,
                 warmup_steps,
                 decay_steps,
                 end_learning_rate=0.0001,
                 power=1.0,
                 cycle=False,
                 begin=0,
                 step=1,
                 dtype='float32'):
        super(LinearDecay, self).__init__(begin, step, dtype)
        self.learning_rate = learning_rate
        self.warmup_steps = warmup_steps
        self.decay_steps = decay_steps
        self.end_learning_rate = end_learning_rate
        self.power = power
        self.cycle = cycle

    def step(self):
        if self.step_num < self.warmup_steps:
            decayed_lr = self.learning_rate * (self.step_num /
                                               self.warmup_steps)
            decayed_lr = self.create_lr_var(decayed_lr)
        else:
            tmp_step_num = self.step_num
            tmp_decay_steps = self.decay_steps
            if self.cycle:
                div_res = fluid.layers.ceil(
                    self.create_lr_var(tmp_step_num / float(self.decay_steps)))
                if tmp_step_num == 0:
                    div_res = self.create_lr_var(1.0)
                tmp_decay_steps = self.decay_steps * div_res
            else:
                tmp_step_num = self.create_lr_var(
                    tmp_step_num
                    if tmp_step_num < self.decay_steps else self.decay_steps)
            decayed_lr = (self.learning_rate - self.end_learning_rate) * \
                ((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate
        return decayed_lr
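For reference, with the arguments DyOptimizer passes below (power=1.0, cycle=False, end learning rate 0.0), the schedule LinearDecay.step() computes is a linear warmup followed by a linear decay. A plain-Python sketch of the same arithmetic (illustrative only; the helper name is made up here):

def linear_warmup_decay_value(step_num, learning_rate, warmup_steps,
                              decay_steps, end_learning_rate=0.0):
    # warmup: scale linearly from 0 up to learning_rate
    if step_num < warmup_steps:
        return learning_rate * step_num / warmup_steps
    # decay: linear interpolation from learning_rate down to end_learning_rate
    step_num = min(step_num, decay_steps)
    return ((learning_rate - end_learning_rate) *
            (1 - step_num / decay_steps) + end_learning_rate)

# e.g. learning_rate=5e-5, warmup_steps=100, decay_steps=1000:
# step 50 -> 2.5e-05 (warming up), step 550 -> 2.25e-05 (decaying)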
class DyOptimizer(object):
    def __init__(self,
                 warmup_steps,
                 num_train_steps,
                 learning_rate,
                 model_cls,
                 weight_decay,
                 scheduler='linear_warmup_decay',
                 loss_scaling=1.0,
                 parameter_list=None):
        self.warmup_steps = warmup_steps
        self.num_train_steps = num_train_steps
        self.learning_rate = learning_rate
        self.model_cls = model_cls
        self.weight_decay = weight_decay
        self.scheduler = scheduler
        self.loss_scaling = loss_scaling
        self.parameter_list = parameter_list

        self.scheduled_lr = 0.0
        self.optimizer = self.lr_schedule()

    def lr_schedule(self):
        if self.warmup_steps > 0:
            if self.scheduler == 'noam_decay':
                self.scheduled_lr = fluid.dygraph.NoamDecay(1 / (
                    self.warmup_steps * (self.learning_rate**2)),
                    self.warmup_steps)
            elif self.scheduler == 'linear_warmup_decay':
                self.scheduled_lr = LinearDecay(self.learning_rate,
                                                self.warmup_steps,
                                                self.num_train_steps, 0.0)
            else:
                raise ValueError("Unknown learning rate scheduler, should be "
                                 "'noam_decay' or 'linear_warmup_decay'")
            optimizer = fluid.optimizer.Adam(
                learning_rate=self.scheduled_lr,
                parameter_list=self.parameter_list)
        else:
            self.scheduled_lr = ConstantLR(self.learning_rate)
            optimizer = fluid.optimizer.Adam(
                learning_rate=self.scheduled_lr,
                parameter_list=self.parameter_list)
        return optimizer

    def exclude_from_weight_decay(self, name):
        # layer norm scales/biases and bias parameters are not decayed
        if name.find("layer_norm") > -1:
            return True
        bias_suffix = ["_bias", "_b", ".b_0"]
        for suffix in bias_suffix:
            if name.endswith(suffix):
                return True
        return False

    def state_dict(self):
        return self.optimizer.state_dict()

    def set_dict(self, state_dict):
        return self.optimizer.set_dict(state_dict)

    def get_opti_var_name_list(self):
        return self.optimizer.get_opti_var_name_list()

    def current_step_lr(self):
        return self.optimizer.current_step_lr()

    def minimize(self, loss, use_data_parallel=False, model=None):
        param_list = dict()
        clip_norm_thres = 1.0
        #grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm_thres)

        if use_data_parallel:
            loss = model.scale_loss(loss)

        loss.backward()

        if self.weight_decay > 0:
            # snapshot parameter values before the Adam step, for decoupled decay
            for param in self.model_cls.parameters():
                param_list[param.name] = param * 1.0
                param_list[param.name].stop_gradient = True

        if use_data_parallel:
            assert model is not None
            model.apply_collective_grads()

        #_, param_grads = self.optimizer.minimize(loss, grad_clip=grad_clip)
        _, param_grads = self.optimizer.minimize(loss)

        if self.weight_decay > 0:
            for param, grad in param_grads:
                if self.exclude_from_weight_decay(param.name):
                    continue
                if isinstance(self.scheduled_lr.step(), float):
                    updated_param = param.numpy() - param_list[
                        param.name].numpy(
                        ) * self.weight_decay * self.scheduled_lr.step()
                else:
                    updated_param = param.numpy(
                    ) - param_list[param.name].numpy(
                    ) * self.weight_decay * self.scheduled_lr.step().numpy()
                updated_param_var = fluid.dygraph.to_variable(updated_param)
                param = updated_param_var
                #param = fluid.layers.reshape(x=updated_param_var, shape=list(updated_param_var.shape))
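Note that minimize() above applies weight decay in a decoupled (AdamW-style) fashion: parameter values are snapshotted before the Adam step, and the decay term is subtracted afterwards, skipping layer norm and bias parameters. A NumPy sketch of one such update, with the Adam step simplified to plain SGD purely for illustration:

import numpy as np

weight_decay, lr = 0.01, 1e-3
param = np.array([0.5, -0.2])
grad = np.array([0.1, 0.3])

p_before = param.copy()                        # the param_list snapshot above
param = param - lr * grad                      # stand-in for the Adam update
param = param - p_before * weight_decay * lr   # decoupled decay term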
@@ -11,172 +11,35 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Optimization and learning rate scheduling."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import paddle.fluid as fluid
-
-from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
+from paddle.fluid.framework import in_dygraph_mode
+from hapi.text.bert.dygraph_optimization import DyOptimizer as DyOptimizer
+from hapi.text.bert.static_optimization import StOptimizer as StOptimizer


+def make_optimizer(warmup_steps,
+                   num_train_steps,
+                   learning_rate,
+                   weight_decay,
+                   model,
+                   scheduler='linear_warmup_decay',
+                   loss_scaling=1.0,
+                   parameter_list=None):
+
+    if in_dygraph_mode():
+        return DyOptimizer(
+            warmup_steps=warmup_steps,
+            num_train_steps=num_train_steps,
+            learning_rate=learning_rate,
+            model_cls=model,
+            weight_decay=weight_decay,
+            scheduler=scheduler,
+            loss_scaling=loss_scaling,
+            parameter_list=parameter_list)
+    else:
+        return StOptimizer(
+            warmup_steps=warmup_steps,
+            num_train_steps=num_train_steps,
+            learning_rate=learning_rate,
+            weight_decay=weight_decay,
+            scheduler=scheduler)

-class ConstantLR(LearningRateDecay):
-    def __init__(self, learning_rate, begin=0, step=1, dtype='float32'):
-        super(ConstantLR, self).__init__(begin, step, dtype)
-        self.learning_rate = learning_rate
-
-    def step(self):
-        return self.learning_rate
-
-
-class LinearDecay(LearningRateDecay):
-    def __init__(self,
-                 learning_rate,
-                 warmup_steps,
-                 decay_steps,
-                 end_learning_rate=0.0001,
-                 power=1.0,
-                 cycle=False,
-                 begin=0,
-                 step=1,
-                 dtype='float32'):
-        super(LinearDecay, self).__init__(begin, step, dtype)
-        self.learning_rate = learning_rate
-        self.warmup_steps = warmup_steps
-        self.decay_steps = decay_steps
-        self.end_learning_rate = end_learning_rate
-        self.power = power
-        self.cycle = cycle
-
-    def step(self):
-        if self.step_num < self.warmup_steps:
-            decayed_lr = self.learning_rate * (self.step_num /
-                                               self.warmup_steps)
-            decayed_lr = self.create_lr_var(decayed_lr)
-        else:
-            tmp_step_num = self.step_num
-            tmp_decay_steps = self.decay_steps
-            if self.cycle:
-                div_res = fluid.layers.ceil(
-                    self.create_lr_var(tmp_step_num / float(self.decay_steps)))
-                if tmp_step_num == 0:
-                    div_res = self.create_lr_var(1.0)
-                tmp_decay_steps = self.decay_steps * div_res
-            else:
-                tmp_step_num = self.create_lr_var(
-                    tmp_step_num
-                    if tmp_step_num < self.decay_steps else self.decay_steps)
-            decayed_lr = (self.learning_rate - self.end_learning_rate) * \
-                ((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate
-        return decayed_lr
-
-
-class Optimizer(object):
-    def __init__(self,
-                 warmup_steps,
-                 num_train_steps,
-                 learning_rate,
-                 model_cls,
-                 weight_decay,
-                 scheduler='linear_warmup_decay',
-                 loss_scaling=1.0,
-                 parameter_list=None):
-        self.warmup_steps = warmup_steps
-        self.num_train_steps = num_train_steps
-        self.learning_rate = learning_rate
-        self.model_cls = model_cls
-        self.weight_decay = weight_decay
-        self.scheduler = scheduler
-        self.loss_scaling = loss_scaling
-        self.parameter_list = parameter_list
-
-        self.scheduled_lr = 0.0
-        self.optimizer = self.lr_schedule()
-
-    def lr_schedule(self):
-        if self.warmup_steps > 0:
-            if self.scheduler == 'noam_decay':
-                self.scheduled_lr = fluid.dygraph.NoamDecay(1 / (
-                    self.warmup_steps * (self.learning_rate**2)),
-                    self.warmup_steps)
-            elif self.scheduler == 'linear_warmup_decay':
-                self.scheduled_lr = LinearDecay(self.learning_rate,
-                                                self.warmup_steps,
-                                                self.num_train_steps, 0.0)
-            else:
-                raise ValueError("Unkown learning rate scheduler, should be "
-                                 "'noam_decay' or 'linear_warmup_decay'")
-            optimizer = fluid.optimizer.Adam(
-                learning_rate=self.scheduled_lr,
-                parameter_list=self.parameter_list)
-        else:
-            self.scheduled_lr = ConstantLR(self.learning_rate)
-            optimizer = fluid.optimizer.Adam(
-                learning_rate=self.scheduled_lr,
-                parameter_list=self.parameter_list)
-        return optimizer
-
-    def exclude_from_weight_decay(self, name):
-        if name.find("layer_norm") > -1:
-            return True
-        bias_suffix = ["_bias", "_b", ".b_0"]
-        for suffix in bias_suffix:
-            if name.endswith(suffix):
-                return True
-        return False
-
-    def state_dict(self):
-        return self.optimizer.state_dict()
-
-    def set_dict(self, state_dict):
-        return self.optimizer.set_dict(state_dict)
-
-    def get_opti_var_name_list(self):
-        return self.optimizer.get_opti_var_name_list()
-
-    def current_step_lr(self):
-        return self.optimizer.current_step_lr()
-
-    def minimize(self, loss, use_data_parallel=False, model=None):
-        param_list = dict()
-        clip_norm_thres = 1.0
-        #grad_clip = fluid.clip.GradientClipByGlobalNorm(clip_norm_thres)
-
-        if use_data_parallel:
-            loss = model.scale_loss(loss)
-
-        loss.backward()
-
-        if self.weight_decay > 0:
-            for param in self.model_cls.parameters():
-                param_list[param.name] = param * 1.0
-                param_list[param.name].stop_gradient = True
-
-        if use_data_parallel:
-            assert model is not None
-            model.apply_collective_grads()
-
-        #_, param_grads = self.optimizer.minimize(loss, grad_clip=grad_clip)
-        _, param_grads = self.optimizer.minimize(loss)
-
-        if self.weight_decay > 0:
-            for param, grad in param_grads:
-                if self.exclude_from_weight_decay(param.name):
-                    continue
-                if isinstance(self.scheduled_lr.step(), float):
-                    updated_param = param.numpy() - param_list[
-                        param.name].numpy(
-                        ) * self.weight_decay * self.scheduled_lr.step()
-                else:
-                    updated_param = param.numpy(
-                    ) - param_list[param.name].numpy(
-                    ) * self.weight_decay * self.scheduled_lr.step().numpy()
-                updated_param_var = fluid.dygraph.to_variable(updated_param)
-                param = updated_param_var
-                #param = fluid.layers.reshape(x=updated_param_var, shape=list(updated_param_var.shape))
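make_optimizer, shown above, simply dispatches on the execution mode: with dygraph enabled it builds a DyOptimizer, otherwise a StOptimizer. A minimal sketch of a dygraph-mode call (the model variable and hyper-parameter values here are hypothetical):

import paddle.fluid as fluid
from hapi.text.bert import make_optimizer

fluid.enable_dygraph()
optimizer = make_optimizer(
    warmup_steps=100,
    num_train_steps=1000,
    learning_rate=5e-5,
    weight_decay=0.01,
    model=model,                        # a hapi Model instance
    parameter_list=model.parameters())
# without enable_dygraph(), the same call would return a StOptimizer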
@@ -19,7 +19,6 @@ from __future__ import print_function

 import numpy as np
 import paddle.fluid as fluid
-from utils.fp16 import create_master_params_grads, master_param_to_train_param, apply_dynamic_loss_scaling


 def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
@@ -51,128 +50,95 @@ def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
     return lr
-def optimization(loss,
-                 warmup_steps,
-                 num_train_steps,
-                 learning_rate,
-                 train_program,
-                 startup_prog,
-                 weight_decay,
-                 scheduler='linear_warmup_decay',
-                 use_fp16=False,
-                 use_dynamic_loss_scaling=False,
-                 init_loss_scaling=1.0,
-                 incr_every_n_steps=1000,
-                 decr_every_n_nan_or_inf=2,
-                 incr_ratio=2.0,
-                 decr_ratio=0.8):
-
-    scheduled_lr, loss_scaling = None, None
-    if scheduler == 'noam_decay':
-        if warmup_steps > 0:
-            scheduled_lr = fluid.layers.learning_rate_scheduler\
-                .noam_decay(1/(warmup_steps *(learning_rate ** 2)),
-                            warmup_steps)
-        else:
-            print(
-                "WARNING: noam decay of learning rate should have postive warmup "
-                "steps but given {}, using constant learning rate instead!"
-                .format(warmup_steps))
-            scheduled_lr = fluid.layers.create_global_var(
-                name=fluid.unique_name.generate("learning_rate"),
-                shape=[1],
-                value=learning_rate,
-                dtype='float32',
-                persistable=True)
-    elif scheduler == 'linear_warmup_decay':
-        if warmup_steps > 0:
-            scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps,
-                                               num_train_steps)
-        else:
-            print(
-                "WARNING: linear warmup decay of learning rate should have "
-                "postive warmup steps but given {}, use constant learning rate "
-                "instead!".format(warmup_steps))
-            scheduled_lr = fluid.layers.create_global_var(
-                name=fluid.unique_name.generate("learning_rate"),
-                shape=[1],
-                value=learning_rate,
-                dtype='float32',
-                persistable=True)
-    else:
-        raise ValueError("Unkown learning rate scheduler, should be "
-                         "'noam_decay' or 'linear_warmup_decay'")
-
-    optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
-    fluid.clip.set_gradient_clip(
-        clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
-
-    def exclude_from_weight_decay(param):
-        name = param.name.rstrip(".master")
-        if name.find("layer_norm") > -1:
-            return True
-        bias_suffix = ["_bias", "_b", ".b_0"]
-        for suffix in bias_suffix:
-            if name.endswith(suffix):
-                return True
-        return False
-
-    param_list = dict()
-
-    if use_fp16:
-        loss_scaling = fluid.layers.create_global_var(
-            name=fluid.unique_name.generate("loss_scaling"),
-            shape=[1],
-            value=init_loss_scaling,
-            dtype='float32',
-            persistable=True)
-        loss *= loss_scaling
-        param_grads = optimizer.backward(loss)
-
-        master_param_grads = create_master_params_grads(
-            param_grads, train_program, startup_prog, loss_scaling)
-
-        if weight_decay > 0:
-            for param, _ in master_param_grads:
-                param_list[param.name] = param * 1.0
-                param_list[param.name].stop_gradient = True
-
-        if use_dynamic_loss_scaling:
-            apply_dynamic_loss_scaling(
-                loss_scaling, master_param_grads, incr_every_n_steps,
-                decr_every_n_nan_or_inf, incr_ratio, decr_ratio)
-
-        optimizer.apply_gradients(master_param_grads)
-
-        if weight_decay > 0:
-            for param, grad in master_param_grads:
-                if exclude_from_weight_decay(param):
-                    continue
-                with param.block.program._optimized_guard(
-                        [param, grad]), fluid.framework.name_scope("weight_decay"):
-                    updated_param = param - param_list[
-                        param.name] * weight_decay * scheduled_lr
-                    fluid.layers.assign(output=param, input=updated_param)
-
-        master_param_to_train_param(master_param_grads, param_grads,
-                                    train_program)
-    else:
-        if weight_decay > 0:
-            for param in train_program.all_parameters():
-                param_list[param.name] = param * 1.0
-                param_list[param.name].stop_gradient = True
-
-        _, param_grads = optimizer.minimize(loss)
-
-        if weight_decay > 0:
-            for param, grad in param_grads:
-                if exclude_from_weight_decay(param):
-                    continue
-                with param.block.program._optimized_guard(
-                        [param, grad]), fluid.framework.name_scope("weight_decay"):
-                    updated_param = param - param_list[
-                        param.name] * weight_decay * scheduled_lr
-                    fluid.layers.assign(output=param, input=updated_param)
-
-    return scheduled_lr, loss_scaling
+class StOptimizer(fluid.optimizer.Optimizer):
+    def __init__(self,
+                 warmup_steps,
+                 num_train_steps,
+                 learning_rate,
+                 weight_decay,
+                 scheduler='linear_warmup_decay'):
+        super(StOptimizer, self).__init__(
+            learning_rate=learning_rate,
+            parameter_list=None,
+            regularization=None,
+            grad_clip=None,
+            name=None)
+        self.warmup_steps = warmup_steps
+        self.num_train_steps = num_train_steps
+        self.learning_rate = learning_rate
+        self.weight_decay = weight_decay
+        self.scheduler = scheduler
+
+    def minimize(self, loss):
+        train_program = fluid.default_main_program()
+        startup_program = fluid.default_startup_program()
+        if self.scheduler == 'noam_decay':
+            if self.warmup_steps > 0:
+                scheduled_lr = fluid.layers.learning_rate_scheduler\
+                    .noam_decay(1/(self.warmup_steps *(self.learning_rate ** 2)),
+                                self.warmup_steps)
+            else:
+                print(
+                    "WARNING: noam decay of learning rate should have positive warmup "
+                    "steps but given {}, using constant learning rate instead!"
+                    .format(self.warmup_steps))
+                scheduled_lr = fluid.layers.create_global_var(
+                    name=fluid.unique_name.generate("learning_rate"),
+                    shape=[1],
+                    value=self.learning_rate,
+                    dtype='float32',
+                    persistable=True)
+        elif self.scheduler == 'linear_warmup_decay':
+            if self.warmup_steps > 0:
+                scheduled_lr = linear_warmup_decay(self.learning_rate,
+                                                   self.warmup_steps,
+                                                   self.num_train_steps)
+            else:
+                print(
+                    "WARNING: linear warmup decay of learning rate should have "
+                    "positive warmup steps but given {}, use constant learning rate "
+                    "instead!".format(self.warmup_steps))
+                scheduled_lr = fluid.layers.create_global_var(
+                    name=fluid.unique_name.generate("learning_rate"),
+                    shape=[1],
+                    value=self.learning_rate,
+                    dtype='float32',
+                    persistable=True)
+        else:
+            raise ValueError("Unknown learning rate scheduler, should be "
+                             "'noam_decay' or 'linear_warmup_decay'")
+
+        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
+        fluid.clip.set_gradient_clip(
+            clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
+
+        def exclude_from_weight_decay(param):
+            name = param.name.rstrip(".master")
+            if name.find("layer_norm") > -1:
+                return True
+            bias_suffix = ["_bias", "_b", ".b_0"]
+            for suffix in bias_suffix:
+                if name.endswith(suffix):
+                    return True
+            return False
+
+        param_list = dict()
+
+        if self.weight_decay > 0:
+            for param in train_program.all_parameters():
+                param_list[param.name] = param * 1.0
+                param_list[param.name].stop_gradient = True
+
+        _, param_grads = optimizer.minimize(loss)
+
+        if self.weight_decay > 0:
+            for param, grad in param_grads:
+                if exclude_from_weight_decay(param):
+                    continue
+                with param.block.program._optimized_guard(
+                        [param, grad]), fluid.framework.name_scope("weight_decay"):
+                    updated_param = param - param_list[
+                        param.name] * self.weight_decay * scheduled_lr
+                    fluid.layers.assign(output=param, input=updated_param)
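Unlike DyOptimizer, StOptimizer ignores parameter lists and rewrites the default main program directly, so it is meant to be called while building a static graph. A minimal static-graph sketch (network shapes and names are hypothetical):

import paddle.fluid as fluid
from hapi.text.bert import StOptimizer

x = fluid.data(name='x', shape=[None, 20], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.reduce_mean(fluid.layers.square_error_cost(pred, y))

optimizer = StOptimizer(
    warmup_steps=100,
    num_train_steps=1000,
    learning_rate=5e-5,
    weight_decay=0.01,
    scheduler='linear_warmup_decay')
optimizer.minimize(loss)  # adds optimize ops to fluid.default_main_program()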
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from hapi.text.senta.data_processer import SentaProcessor
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np

from hapi.text.senta.data_reader import load_vocab
from hapi.text.senta.data_reader import data_reader
from paddle.io import DataLoader


class SentaProcessor(object):
    def __init__(self, data_dir, vocab_path, random_seed=None):
        self.data_dir = data_dir
        self.vocab = load_vocab(vocab_path)
        self.num_examples = {"train": -1, "dev": -1, "infer": -1}
        np.random.seed(random_seed)

    def get_train_examples(self, data_dir, epoch, shuffle, batch_size, places,
                           padding_size):
        train_reader = data_reader((self.data_dir + "/train.tsv"), self.vocab,
                                   self.num_examples, "train", epoch,
                                   padding_size, shuffle)
        loader = DataLoader.from_generator(capacity=50, return_list=True)
        loader.set_sample_generator(
            train_reader, batch_size=batch_size, drop_last=False, places=places)
        return loader

    def get_dev_examples(self, data_dir, epoch, shuffle, batch_size, places,
                         padding_size):
        dev_reader = data_reader((self.data_dir + "/dev.tsv"), self.vocab,
                                 self.num_examples, "dev", epoch, padding_size,
                                 shuffle)
        loader = DataLoader.from_generator(capacity=50, return_list=True)
        loader.set_sample_generator(
            dev_reader, batch_size=batch_size, drop_last=False, places=places)
        return loader

    def get_test_examples(self, data_dir, epoch, batch_size, places,
                          padding_size):
        test_reader = data_reader((self.data_dir + "/test.tsv"), self.vocab,
                                  self.num_examples, "infer", epoch,
                                  padding_size)
        loader = DataLoader.from_generator(capacity=50, return_list=True)
        loader.set_sample_generator(
            test_reader, batch_size=batch_size, drop_last=False, places=places)
        return loader

    def get_labels(self):
        return ["0", "1"]

    def get_num_examples(self, phase):
        if phase not in ['train', 'dev', 'infer']:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'infer'].")
        return self.num_examples[phase]

    def get_train_progress(self):
        return self.current_train_example, self.current_train_epoch

    def data_generator(self, padding_size, batch_size, places, phase='train',
                       epoch=1, shuffle=True):
        if phase == "train":
            return self.get_train_examples(self.data_dir, epoch, shuffle,
                                           batch_size, places, padding_size)
        elif phase == "dev":
            return self.get_dev_examples(self.data_dir, epoch, shuffle,
                                         batch_size, places, padding_size)
        elif phase == "infer":
            return self.get_test_examples(self.data_dir, epoch, batch_size,
                                          places, padding_size)
        else:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'infer'].")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import io
import sys
import random


def str2bool(v):
    return v.lower() in ("true", "t", "1")


def data_reader(file_path, word_dict, num_examples, phrase, epoch,
                padding_size, shuffle=False):
    unk_id = len(word_dict)
    all_data = []
    with io.open(file_path, "r", encoding='utf8') as fin:
        for line in fin:
            if line.startswith('text_a'):
                continue
            cols = line.strip().split("\t")
            if len(cols) != 2:
                sys.stderr.write("[NOTICE] Error Format Line!")
                continue
            label = [int(cols[1])]
            wids = [
                word_dict[x] if x in word_dict else unk_id
                for x in cols[0].split(" ")
            ]
            wids = wids[:padding_size]
            while len(wids) < padding_size:
                wids.append(unk_id)
            all_data.append((wids, label))
    if shuffle:
        if phrase == "train":
            random.shuffle(all_data)
    num_examples[phrase] = len(all_data)

    def reader():
        for epoch_index in range(epoch):
            for doc, label in all_data:
                yield doc, label

    return reader


def load_vocab(file_path):
    vocab = {}
    with io.open(file_path, 'r', encoding='utf8') as f:
        wid = 0
        for line in f:
            if line.strip() not in vocab:
                vocab[line.strip()] = wid
                wid += 1
    vocab["<unk>"] = len(vocab)
    return vocab
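The lower-level pieces can also be used directly; a short sketch (file paths hypothetical):

vocab = load_vocab('./senta_data/word_dict.txt')
num_examples = {"train": -1, "dev": -1, "infer": -1}

reader = data_reader('./senta_data/train.tsv', vocab, num_examples,
                     'train', epoch=1, padding_size=150, shuffle=True)
for wids, label in reader():
    # wids: list of padding_size word ids, label: single-element list
    break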
@@ -1096,7 +1096,8 @@ class PrePostProcessLayer(Layer):
         self.functors = []
         for cmd in self.process_cmd:
             if cmd == "a":  # add residual connection
-                self.functors.append(lambda x, y: x + y if y else x)
+                self.functors.append(
+                    lambda x, y: x + y if y is not None else x)
             elif cmd == "n":  # add layer normalization
                 if reused_layer_norm is not None:
                     layer_norm = reused_layer_norm
@@ -1218,7 +1219,7 @@ class MultiHeadAttention(Layer):
         # scale dot product attention
         product = layers.matmul(
             x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5)
-        if attn_bias:
+        if attn_bias is not None:
             product += attn_bias
         weights = layers.softmax(product)
         if self.dropout_rate:
......
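The two "is not None" fixes in this file guard against tensor truthiness: evaluating a multi-element array in a boolean context is ambiguous (NumPy raises, and a framework Variable is not guaranteed to support it), so "if attn_bias:" was not a safe way to test for an optional bias. A small NumPy illustration:

import numpy as np

attn_bias = np.zeros((2, 2))
try:
    if attn_bias:              # truth value of a multi-element array
        pass
except ValueError as err:
    print(err)                 # "The truth value of an array ... is ambiguous"

if attn_bias is not None:      # the explicit check the patch switches to
    product = attn_bias + 1.0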