Unverified · Commit 830e2b7e · authored by zhanghan, committed by GitHub

Merge pull request #1 from PaddlePaddle/develop

update
*.pyc
*.un~
*.swp
*.egg-info/
export FLAGS_enable_parallel_graph=1
export FLAGS_sync_nccl_allreduce=1
BERT_BASE_PATH="chinese_L-12_H-768_A-12"
TASK_NAME='xnli'
DATA_PATH=data/xnli/XNLI-MT-1.0
CKPT_PATH=pretrain_model
train(){
python -u run_classifier.py --task_name ${TASK_NAME} \
--use_cuda true \
--do_train true \
--do_val false \
--do_test false \
--batch_size 8192 \
--in_tokens true \
--init_checkpoint pretrain_model/chinese_L-12_H-768_A-12/ \
--data_dir ${DATA_PATH} \
--vocab_path pretrain_model/chinese_L-12_H-768_A-12/vocab.txt \
--checkpoints ${CKPT_PATH} \
--save_steps 1000 \
--weight_decay 0.01 \
--warmup_proportion 0.0 \
--validation_steps 25 \
--epoch 1 \
--max_seq_len 512 \
--bert_config_path pretrain_model/chinese_L-12_H-768_A-12/bert_config.json \
--learning_rate 1e-4 \
--skip_steps 10 \
--random_seed 100 \
--enable_ce \
--shuffle false
}
export CUDA_VISIBLE_DEVICES=0
train | python _ce.py
export CUDA_VISIBLE_DEVICES=0,1,2,3
train | python _ce.py
Hi!
This directory has been deprecated.
Please visit the project at [models/PaddleNLP/language_representations_kit/BERT](https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/language_representations_kit/BERT).
train() {
python train.py \
--train_path='data/train/sentence_file_*' \
--test_path='data/dev/sentence_file_*' \
--vocab_path data/vocabulary_min5k.txt \
--learning_rate 0.2 \
--use_gpu True \
--all_train_tokens 35479 \
--max_epoch 10 \
--log_interval 5 \
--dev_interval 20 \
--local True $@ \
--enable_ce \
--shuffle false \
--random_seed 100
}
export CUDA_VISIBLE_DEVICES=0
train | python _ce.py
export CUDA_VISIBLE_DEVICES=0,1,2,3
train | python _ce.py
Hi!
This directory has been deprecated.
Please visit the project at [models/PaddleNLP/language_representations_kit/ELMo](https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/language_representations_kit/ELMo).
<div align="center">
<h1>
<font color="red">
The ERNIE project has moved <a href="../README.zh.md">here</a>
</font>
</h1>
</div>
## ERNIE: **E**nhanced **R**epresentation through k**N**owledge **I**nt**E**gration
**2019-04-10 update**: released ERNIE_stable-1.0.1.tar.gz, which packages the model parameters together with the config ernie_config.json and vocab.txt

**2019-03-18 update**: released ERNIE_stable.tgz
**ERNIE** learns real-world semantic knowledge by modeling the words, entities, and entity relations in massive corpora. Whereas **BERT** learns from raw language signals, **ERNIE** directly models prior semantic knowledge units, which strengthens its semantic representation ability.

Here is an example:
```Learnt by BERT :哈 [mask] 滨是 [mask] 龙江的省会,[mask] 际冰 [mask] 文化名城。```
```Learnt by ERNIE:[mask] [mask] [mask] 是黑龙江的省会,国际 [mask] [mask] 文化名城。```
In **BERT**, the character 『尔』 can be predicted from the local co-occurrence of 『哈』 and 『滨』 alone; the model learns nothing about the entity 『哈尔滨』 (Harbin) itself. By learning representations of words and entities, **ERNIE** can model the relation between 『哈尔滨』 (Harbin) and 『黑龙江』 (Heilongjiang), learning that Harbin is the capital of Heilongjiang province and that Harbin is a city of ice and snow.
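To make the contrast concrete, here is a minimal, hypothetical sketch of the two masking strategies (this is not the project's preprocessing code; the entity list and the 15% mask ratio are illustrative assumptions):

```python
import random

def char_level_mask(chars, ratio=0.15):
    # BERT-style: every character is an independent masking candidate.
    return ["[MASK]" if random.random() < ratio else c for c in chars]

def entity_level_mask(chars, entities, ratio=0.15):
    # ERNIE-style: when a known entity starts at the current position,
    # the whole entity span is masked (or kept) as one unit.
    out, i = [], 0
    while i < len(chars):
        entity = next((e for e in entities
                       if "".join(chars[i:i + len(e)]) == e), None)
        if entity:
            out.extend(["[MASK]"] * len(entity) if random.random() < ratio
                       else list(entity))
            i += len(entity)
        else:
            out.append(chars[i])
            i += 1
    return out

sentence = list("哈尔滨是黑龙江的省会")
print(entity_level_mask(sentence, entities=["哈尔滨", "黑龙江"]))
```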
As for training data, in addition to encyclopedia and news corpora, **ERNIE** also introduces forum dialogue data. It uses a **DLM** (Dialogue Language Model) to model query-response dialogue structure: dialogue pairs are taken as input, a Dialogue Embedding identifies the role of each utterance, and a Dialogue Response Loss learns the implicit relations within the dialogue, further improving the model's semantic representation ability.

We evaluated the models on 5 public Chinese datasets covering natural language inference, semantic similarity, named entity recognition, sentiment analysis, and question-answer matching; **ERNIE** achieves better results than **BERT** on all of them.
<table>
<tbody>
<tr>
<th><strong>Dataset</strong>
<br></th>
<th colspan="2"><strong>XNLI</strong></th>
<th colspan="2"><strong>LCQMC</strong></th>
<th colspan="2"><strong>MSRA-NER(SIGHAN 2006)</strong></th>
<th colspan="2"><strong>ChnSentiCorp</strong></th>
<th colspan="4"><strong>nlpcc-dbqa</strong></th></tr>
<tr>
<td rowspan="2">
<p>
<strong>Evaluation</strong></p>
<p>
<strong>Metric</strong>
<br></p>
</td>
<td colspan="2">
<strong>acc</strong>
<br></td>
<td colspan="2">
<strong>acc</strong>
<br></td>
<td colspan="2">
<strong>f1-score</strong>
<br></td>
<td colspan="2">
<strong>acc</strong>
<strong></strong>
<br></td>
<td colspan="2">
<strong>mrr</strong>
<br></td>
<td colspan="2">
<strong>f1-score</strong>
<br></td>
</tr>
<tr>
<th colspan="1" width="">
<strong>dev</strong>
<br></th>
<td colspan="1" width="">
<strong>test</strong>
<br></td>
<td colspan="1" width="">
<strong>dev</strong>
<br></td>
<td colspan="1" width="">
<strong>test</strong>
<br></td>
<td colspan="1" width="">
<strong>dev</strong>
<br></td>
<td colspan="1" width="">
<strong>test</strong>
<br></td>
<td colspan="1" width="">
<strong>dev</strong>
<br></td>
<td colspan="1" width="">
<strong>test</strong>
<br></td>
<td colspan="1" width="">
<strong>dev</strong>
<br></td>
<td colspan="1" width="">
<strong>test</strong>
<br></td>
<td colspan="1" width="">
<strong>dev</strong>
<br></td>
<td colspan="1" width="">
<strong>test</strong>
<br></td>
</tr>
<tr>
<td>
<strong>BERT
<br></strong></td>
<td>78.1</td>
<td>77.2</td>
<td>88.8</td>
<td>87.0</td>
<td>94.0
<br></td>
<td>
<span>92.6</span></td>
<td>94.6</td>
<td>94.3</td>
<td colspan="1">94.7</td>
<td colspan="1">94.6</td>
<td colspan="1">80.7</td>
<td colspan="1">80.8</td></tr>
<tr>
<td>
<strong>ERNIE
<br></strong></td>
<td>79.9 <span>(<strong>+1.8</strong>)</span></td>
<td>78.4 <span>(<strong>+1.2</strong>)</span></td>
<td>89.7 <span>(<strong>+0.9</strong>)</span></td>
<td>87.4 <span>(<strong>+0.4</strong>)</span></td>
<td>95.0 <span>(<strong>+1.0</strong>)</span></td>
<td>93.8 <span>(<strong>+1.2</strong>)</span></td>
<td>95.2 <span>(<strong>+0.6</strong>)</span></td>
<td>95.4 <span>(<strong>+1.1</strong>)</span></td>
<td colspan="1">95.0 <span>(<strong>+0.3</strong>)</span></td>
<td colspan="1">95.1 <span>(<strong>+0.5</strong>)</span></td>
<td colspan="1">82.3 <span>(<strong>+1.6</strong>)</span></td>
<td colspan="1">82.7 <span>(<strong>+1.9</strong>)</span></td></tr>
</tbody>
</table>
- **Natural Language Inference** XNLI
```text
XNLI was jointly built by Facebook and New York University researchers to evaluate a model's multilingual sentence understanding. The task is to classify the relation between two sentences (contradiction, neutral, entailment). [url: https://github.com/facebookresearch/XNLI]
```
- **Semantic Similarity** LCQMC
```text
LCQMC is a question-matching dataset built by Harbin Institute of Technology and published at COLING 2018; the task is to judge whether two questions have the same meaning. [url: http://aclweb.org/anthology/C18-1166]
```
- **Named Entity Recognition** MSRA-NER (SIGHAN 2006)
```text
The MSRA-NER (SIGHAN 2006) dataset was released by Microsoft Research Asia. The task is named entity recognition: identifying entities with specific meaning in text, mainly person names, place names, and organization names.
```
- **Sentiment Analysis** ChnSentiCorp
```text
ChnSentiCorp is a Chinese sentiment analysis dataset; the task is to judge the sentiment of a passage.
```
- **Retrieval Question Answering** nlpcc-dbqa
```text
nlpcc-dbqa is an evaluation task held in 2016 by NLPCC (the International Conference on Natural Language Processing and Chinese Computing); the task is to select answers that can answer the given question. [url: http://tcci.ccf.org.cn/conference/2016/dldoc/evagline2.pdf]
```
### Models & Data

1) Pre-trained model download

| Model | Description |
| :------| :------ |
| [model](https://ernie.bj.bcebos.com/ERNIE_stable.tgz) | pre-trained model parameters |
| [model (with config and vocab)](https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz) | pre-trained model parameters, vocabulary vocab.txt, model config ernie_config.json |

2) [Task data download](https://ernie.bj.bcebos.com/task_data_zh.tgz)

### Installation

This project depends on Paddle Fluid 1.3.1; please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.

**Note**: The pre-training and fine-tuning tasks were tested on P40 GPUs with 22 GB of memory; with less than 22 GB, some tasks may fail with out-of-memory errors.
### Pre-training

#### Data Preprocessing

Sentence-pair data with contextual relations are constructed from encyclopedia, news, and forum-dialogue corpora. Baidu's internal lexical analysis tool segments the sentence pairs at character, word, and entity granularity; the CharTokenizer in [`tokenization.py`](tokenization.py) then tokenizes the segmented data into plain-text token sequences with segmentation boundaries, and the tokens are mapped to ids using the vocabulary [`config/vocab.txt`](config/vocab.txt). During training, consecutive tokens are randomly masked according to the segmentation boundaries.

We provide part of the id-converted training data, [`data/demo_train_set.gz`](./data/demo_train_set.gz), and validation data, [`data/demo_valid_set.gz`](./data/demo_valid_set.gz); each line is one training sample, for example:
```
1 1048 492 1333 1361 1051 326 2508 5 1803 1827 98 164 133 2777 2696 983 121 4 19 9 634 551 844 85 14 2476 1895 33 13 983 121 23 7 1093 24 46 660 12043 2 1263 6 328 33 121 126 398 276 315 5 63 44 35 25 12043 2;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1;0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55;-1 0 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 -1 0 0 0 1 0 0 1 0 1 0 0 1 0 1 0 -1;0
```
Each sample consists of 5 fields separated by '`;`', in the format `token_ids; sentence_type_ids; position_ids; seg_labels; next_sentence_label`. In `seg_labels`, 0 marks the beginning of a word, 1 marks a non-initial position, and -1 is a placeholder whose corresponding token is `CLS` or `SEP`.
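For illustration, a small sketch (a hypothetical helper, not part of this repository) that splits one such line into its fields:

```python
def parse_sample(line):
    """Split one pre-training sample into its 5 ';'-separated fields."""
    token_ids, sent_types, positions, seg_labels, next_sent = line.rstrip().split(";")
    return {
        "token_ids": [int(x) for x in token_ids.split()],
        "sentence_type_ids": [int(x) for x in sent_types.split()],
        "position_ids": [int(x) for x in positions.split()],
        # 0 = word-initial, 1 = non-initial, -1 = placeholder (CLS/SEP)
        "seg_labels": [int(x) for x in seg_labels.split()],
        "next_sentence_label": int(next_sent),
    }
```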
#### Start Training

The entry script for pre-training is [`script/pretrain.sh`](./script/pretrain.sh).

Before pre-training, add the CUDA, cuDNN, and NCCL2 dynamic library paths to the environment variable LD_LIBRARY_PATH; then run `bash script/pretrain.sh` to start pre-training on the demo data with the default configuration.

During pre-training, the current learning rate, epochs processed, total step count, training loss, and training speed are printed; every N steps, as configured by `--validation_steps ${N}`, validation metrics are printed as well:
```
current learning_rate:0.000001
epoch: 1, progress: 1/1, step: 30, loss: 10.540648, ppl: 19106.925781, next_sent_acc: 0.625000, speed: 0.849662 steps/s, file: ./data/demo_train_set.gz, mask_type: mask_word
feed_queue size 70
current learning_rate:0.000001
epoch: 1, progress: 1/1, step: 40, loss: 10.529287, ppl: 18056.654297, next_sent_acc: 0.531250, speed: 0.849549 steps/s, file: ./data/demo_train_set.gz, mask_type: mask_word
feed_queue size 70
current learning_rate:0.000001
epoch: 1, progress: 1/1, step: 50, loss: 10.360563, ppl: 16398.287109, next_sent_acc: 0.625000, speed: 0.843776 steps/s, file: ./data/demo_train_set.gz, mask_type: mask_word
```
To train on your own real data, adjust the parameters in [`script/pretrain.sh`](./script/pretrain.sh) accordingly.

### Fine-tuning Tasks

After ERNIE pre-training, the pre-trained parameters can be fine-tuned on specific NLP tasks. Below we show how to fine-tune classification and sequence-labeling tasks on top of the pre-trained ERNIE model; to run them, first download the corresponding pre-trained model via the links in the [Models & Data](#models--data) section.

Unpack the downloaded model into `${MODEL_PATH}`; `${MODEL_PATH}` then contains the parameter directory `params`.

Unpack the downloaded task data into `${TASK_DATA_PATH}`; it contains the training and test data of 5 tasks: `LCQMC`, `XNLI`, `MSRA-NER`, `ChnSentCorp`, and `nlpcc-dbqa`.

#### Single-Sentence and Sentence-Pair Classification Tasks

1) **Single-sentence classification**:

Take the `ChnSentiCorp` sentiment classification dataset as an example of single-sentence classification. The data are tsv files with 2 fields, `text_a label`; sample data:
```
label text_a
0 当当网名不符实,订货多日不见送货,询问客服只会推托,只会要求用户再下订单。如此服务留不住顾客的。去别的网站买书服务更好。
0 XP的驱动不好找!我的17号提的货,现在就降价了100元,而且还送杀毒软件!
1 <荐书> 推荐所有喜欢<红楼>的红迷们一定要收藏这本书,要知道当年我听说这本书的时候花很长时间去图书馆找和借都没能如愿,所以这次一看到当当有,马上买了,红迷们也要记得备货哦!
```
Run `bash script/run_ChnSentiCorp.sh` to start fine-tuning; when it finishes, results on the dev and test sets are printed as follows:
```
[dev evaluation] ave loss: 0.189373, ave acc: 0.954167, data_num: 1200, elapsed time: 14.984404 s
[test evaluation] ave loss: 0.189387, ave acc: 0.950000, data_num: 1200, elapsed time: 14.737691 s
```
2) **Sentence-pair classification**

Take the `LCQMC` semantic similarity task as an example of sentence-pair classification. The data are tsv files with 3 fields, `text_a text_b label`; sample data:
```
text_a text_b label
开初婚未育证明怎么弄? 初婚未育情况证明怎么开? 1
谁知道她是网络美女吗? 爱情这杯酒谁喝都会醉是什么歌 0
这腰带是什么牌子 护腰带什么牌子好 0
```
Run `bash script/run_lcqmc.sh` to start fine-tuning; when it finishes, results on the dev and test sets are printed as follows:
```
[dev evaluation] ave loss: 0.290925, ave acc: 0.900704, data_num: 8802, elapsed time: 32.240948 s
[test evaluation] ave loss: 0.345714, ave acc: 0.878080, data_num: 12500, elapsed time: 39.738015 s
```
#### Sequence Labeling Tasks

1) **Named entity recognition**

Take `MSRA-NER (SIGHAN 2006)` as an example. The data are tsv files with 2 fields, `text_a label`; sample data:
```
text_a label
在 这 里 恕 弟 不 恭 之 罪 , 敢 在 尊 前 一 诤 : 前 人 论 书 , 每 曰 “ 字 字 有 来 历 , 笔 笔 有 出 处 ” , 细 读 公 字 , 何 尝 跳 出 前 人 藩 篱 , 自 隶 变 而 后 , 直 至 明 季 , 兄 有 何 新 出 ? O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
相 比 之 下 , 青 岛 海 牛 队 和 广 州 松 日 队 的 雨 中 之 战 虽 然 也 是 0 ∶ 0 , 但 乏 善 可 陈 。 O O O O O B-ORG I-ORG I-ORG I-ORG I-ORG O B-ORG I-ORG I-ORG I-ORG I-ORG O O O O O O O O O O O O O O O O O O O
理 由 多 多 , 最 无 奈 的 却 是 : 5 月 恰 逢 双 重 考 试 , 她 攻 读 的 博 士 学 位 论 文 要 通 考 ; 她 任 教 的 两 所 学 校 , 也 要 在 这 段 时 日 大 考 。 O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
```
Run `bash script/run_msra_ner.sh` to start fine-tuning; when it finishes, results on the dev and test sets are printed as follows:
```
[dev evaluation] f1: 0.951949, precision: 0.944636, recall: 0.959376, elapsed time: 19.156693 s
[test evaluation] f1: 0.937390, precision: 0.925988, recall: 0.949077, elapsed time: 36.565929 s
```
### FAQ
#### How to get the ERNIE-encoded embedding of an input sentence?

Use `ernie_encoder.py` to extract the embedding of an input sentence and the embedding of each token in it. The data format is the same as the fine-tuning data formats described in the [Fine-tuning Tasks](#fine-tuning-tasks) section. For example, to get sentence and token embeddings for the LCQMC dev set:
```
export FLAGS_sync_nccl_allreduce=1
export CUDA_VISIBLE_DEVICES=7
python -u ernie_encoder.py \
--use_cuda true \
--batch_size 32 \
--output_dir "./test" \
--init_pretraining_params ${MODEL_PATH}/params \
--data_set ${TASK_DATA_PATH}/lcqmc/dev.tsv \
--vocab_path config/vocab.txt \
--max_seq_len 128 \
--ernie_config_path config/ernie_config.json
```
When the script finishes, `cls_emb.npy` (sentence embeddings) and `top_layer_emb.npy` (token embeddings) are written to the `test` directory under the current path. In practice, adjust the data path, the embedding output path, and other settings following this example.
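A quick way to inspect the two output files (a sketch; the exact shapes depend on your data and model):

```python
import numpy as np

cls_emb = np.load("./test/cls_emb.npy")              # one sentence embedding per example
top_layer_emb = np.load("./test/top_layer_emb.npy")  # token-level embeddings
print(cls_emb.shape, top_layer_emb.shape)
```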
#### How to get the ERNIE-encoded embedding of each token in an input sentence?

Same solution as the previous question.

#### How to batch-predict on new data with a fine-tuned model?

Taking classification as an example, we provide a batch-prediction script; usage:
```
python -u predict_classifier.py \
--use_cuda true \
--batch_size 32 \
--vocab_path config/vocab.txt \
--init_checkpoint "./checkpoints/step_100" \
--do_lower_case true \
--max_seq_len 128 \
--ernie_config_path config/ernie_config.json \
--do_predict true \
--predict_set ${TASK_DATA_PATH}/lcqmc/test.tsv \
--num_labels 2
```
In practice, specify the model used for prediction with `init_checkpoint`, the file to predict on with `predict_set`, and the number of classes with `num_labels`.

**Note**: `predict_set` is a 1-column/2-column tsv file made up of `text_a` and optional `text_b`.
English | [简体中文](./README.zh.md)

**Try ERNIE with *eager execution*: please check out branch `dygraph`.**

## ERNIE 2.0: A Continual Pre-training Framework for Language Understanding
* [IR Relevance Task](#ir-relevance-task)
* [ERNIE 1.0: <strong>E</strong>nhanced <strong>R</strong>epresentation through k<strong>N</strong>owledge <strong>I</strong>nt<strong>E</strong>gration](#ernie-10-enhanced-representation-through-knowledge-integration)
* [Compare the ERNIE 1.0 and ERNIE 2.0](#compare-the-ernie-10-and-ernie-20)
* [Results](#results)
* [Results on English Datasets](#results-on-english-datasets)
* [Results on Chinese Datasets](#results-on-chinese-datasets)
* [Communication](#communication)
* [Usage](#usage)
![ernie2.0_paper](.metas/ernie2.0_paper.png)

| **Structure-aware** | | ✅ Sentence Reordering | ✅ Sentence Reordering <br> ✅ Sentence Distance |
| **Semantic-aware** | ✅ Next Sentence Prediction | ✅ Discourse Relation | ✅ Discourse Relation <br> ✅ IR Relevance |
## Release Notes
- July 30, 2019: release ERNIE 2.0
- Apr 10, 2019: update ERNIE_stable-1.0.1.tar.gz, update config and vocab
- Mar 18, 2019: update ERNIE_stable.tgz
- Mar 15, 2019: release ERNIE 1.0
## Results

*\*The DRCD dataset is converted from Traditional Chinese to Simplified Chinese based on the tool: https://github.com/skydark/nstools/tree/master/zhtools*

\* *The pre-training data of ERNIE 1.0 BASE does not contain instances whose length exceeds 128, while the other models are pre-trained with instances of length 512. This causes poorer performance of ERNIE 1.0 BASE on long-text tasks, so we released [ERNIE 1.0 Base (max-len-512)](https://ernie.bj.bcebos.com/ERNIE_1.0_max-len-512.tar.gz) on July 29th, 2019.*
<tr>
<th><strong>Dataset</strong>
<br></th>
<th colspan="2"><center><strong>MSRA-NER (SIGHAN2006)</strong></center></th>
<tr>
<td rowspan="2">
<p>
</tbody>
</table>

- **MSRA-NER (SIGHAN2006)**

```text
MSRA-NER (SIGHAN2006) dataset is released by MSRA for recognizing the names of people, locations and organizations in text.
```
#### Results on Sentiment Analysis Task

- **BQ Corpus**

```text
BQ Corpus (Bank Question corpus) is a Chinese corpus for sentence semantic equivalence identification. This dataset was published in EMNLP 2018. [url: https://www.aclweb.org/anthology/D18-1536]
```
## Communication
- [Github Issues](https://github.com/PaddlePaddle/ERNIE/issues): bug reports, feature requests, install issues, usage issues, etc.
- QQ discussion group: 760439550 (ERNIE discussion group).
- [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
## Usage

* [Install PaddlePaddle](#install-paddlepaddle)
* [Pre-trained Models &amp; Datasets](#pre-trained-models--datasets)
* [Chinese Datasets](#chinese-datasets)
* [Fine-tuning](#fine-tuning)
* [Batchsize and GPU Settings](#batchsize-and-gpu-settings)
* [Multiprocessing and fp16 auto mix-precision finetune](#multiprocessing-and-fp16-auto-mix-precision-finetune)
* [Classification](#classification)
* [Single Sentence Classification Tasks](#single-sentence-classification-tasks)
* [Sentence Pair Classification Tasks](#sentence-pair-classification-tasks)
* [Machine Reading Comprehension](#machine-reading-comprehension)
* [Pre-training with ERNIE 1.0](#pre-training-with-ernie-10)
* [Data Preprocessing](#data-preprocessing)
* [Pretrain ERNIE1.0](#pretrain-ernie10)
* [Distillation](#distillation)
* [FAQ](#faq)
* [FAQ1: How to get sentence/tokens embedding of ERNIE?](#faq1-how-to-get-sentencetokens-embedding-of-ernie)
* [FAQ2: How to predict on new data with Fine-tuning model?](#faq2-how-to-predict-on-new-data-with-fine-tuning-model)
* [FAQ3: Is the argument batch_size for one GPU card or for all GPU cards?](#faq3-is-the--argument-batch_size-for-one-gpu-card-or-for-all-gpu-cards)
* [FAQ4: Can not find library: libcudnn.so. Please try to add the lib path to LD_LIBRARY_PATH.](#faq4-can-not-find-library-libcudnnso-please-try-to-add-the-lib-path-to-ld_library_path)
* [FAQ5: Can not find library: libnccl.so. Please try to add the lib path to LD_LIBRARY_PATH.](#faq5-can-not-find-library-libncclso-please-try-to-add-the-lib-path-to-ld_library_path)
* [FAQ6: Runtime error: `ModuleNotFoundError No module named propeller`](#faq6)
### Install PaddlePaddle

This code base has been tested with Paddle Fluid 1.6 under Python 2/3.5+. Since Paddle 1.6 changed some of its APIs, running this code base with a version earlier than 1.6 may produce bugs on NER tasks.

**\*Important\*** When finished installing Paddle Fluid, remember to update LD_LIBRARY_PATH for CUDA, cuDNN, and NCCL2; for more information on the PaddlePaddle setup, you can click [here](http://en.paddlepaddle.org/documentation/docs/en/1.5/beginners_guide/index_en.html) and [here](http://en.paddlepaddle.org/documentation/docs/en/1.5/beginners_guide/install/install_Ubuntu_en.html). Also, you can read the FAQ at the end of this document when you encounter errors.

For beginners of PaddlePaddle, the following documentation will guide you through installing PaddlePaddle:

For more information about PaddlePaddle, please refer to the [PaddlePaddle Github](https://github.com/PaddlePaddle/Paddle) or the [Official Website](https://www.paddlepaddle.org.cn/) for details.
Other dependencies of ERNIE are listed in `requirements.txt`; you can install them with
```script
pip install -r requirements.txt
```
### Pre-trained Models & Datasets

#### Models

| Model | Description |
| :------------------------------------------------- | :----------------------------------------------------------- |
| [ERNIE 2.0 Base for English](https://ernie.bj.bcebos.com/ERNIE_Base_en_stable-2.0.0.tar.gz) | with params, config and vocabs |
| [ERNIE 2.0 Large for English](https://ernie.bj.bcebos.com/ERNIE_Large_en_stable-2.0.0.tar.gz) | with params, config and vocabs |
#### Datasets

##### English Datasets

Download the [GLUE data](https://gluebenchmark.com/tasks) by running [this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) and unpack it to some directory `${TASK_DATA_PATH}`

After the dataset is downloaded, you should run `sh ./script/en_glue/preprocess/cvt.sh $TASK_DATA_PATH` to convert the data format for training. If everything goes well, there will be a folder named `glue_data_processed` created with all the converted data in it.

##### Chinese Datasets

You can download Chinese Datasets from [here](https://ernie.bj.bcebos.com/task_data_zh.tgz)

#### Fine-tuning

##### Batchsize and GPU Settings

In our experiments, we found that the batch size is important for different tasks. To help users reproduce results more easily, we list the batch size and GPU cards here:
| Dataset | Batch Size | GPU |
| ------------ | --------------- | ------------------- |
| CoLA | 32 / 64 (base) | 1 |
| SST-2 | 64 / 256 (base) | 8 |
| STS-B | 128 | 8 |
| QQP | 256 | 8 |
| MNLI | 256 / 512 (base) | 8 |
| QNLI | 256 | 8 |
| RTE | 16 / 4 (base) | 1 |
| MRPC | 16 / 32 (base) | 2 |
| WNLI | 8 | 1 |
| XNLI | 65536 (tokens) | 8 |
| CMRC2018 | 64 | 8 (large) / 4 (base) |
\* *For MNLI and QNLI we used 32GB V100 GPUs; for the other tasks we used 22GB P40s*
#### Multiprocessing and fp16 auto mix-precision finetune

Multiprocess fine-tuning can be enabled simply by adding `finetune_launch.py` to your fine-tune script. With multiprocess fine-tuning, Paddle can fully utilize your CPU/GPU capacity to accelerate fine-tuning.

`finetune_launch.py` should be placed in front of your fine-tune command. Make sure to provide the number of processes and the device ids per node by specifying `--nproc_per_node` and `--selected_gpus`. The number of device ids should match `nproc_per_node` and `CUDA_VISIBLE_DEVICES`, and the indexing should start from 0.

fp16 fine-tuning can be enabled simply by specifying `--use_fp16 true` in your training script (make sure you have a Tensor Core device). ERNIE will cast computation ops to fp16 precision while keeping storage in fp32 precision; approximately a 60% speedup is seen on XNLI fine-tuning. Dynamic loss scaling is used to avoid gradient underflow; a sketch of the idea follows below.
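For intuition, a minimal sketch of the dynamic loss-scaling idea (illustrative only, not ERNIE's actual implementation; the 1000-step growth interval is an assumption):

```python
def update_loss_scale(scale, good_steps, grads_are_finite, incr_every=1000):
    """One bookkeeping step of dynamic loss scaling.

    The loss is multiplied by `scale` before backprop so that small fp16
    gradients do not underflow, and gradients are divided by `scale` before
    the optimizer update. On overflow the update is skipped and the scale is
    halved; after `incr_every` consecutive stable steps the scale is doubled.
    """
    if not grads_are_finite:            # inf/nan gradients: skip this update
        return scale / 2.0, 0, False    # halve the scale, reset the counter
    if good_steps + 1 >= incr_every:    # long stable run: try a larger scale
        return scale * 2.0, 0, True
    return scale, good_steps + 1, True  # keep the scale, apply the update
```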
#### Classification
##### Single Sentence Classification Tasks
The code used to perform classification/regression fine-tuning is in `run_classifier.py`; we also provide the shell scripts for each task, including the best hyperparameters.

Take an English task `SST-2` and a Chinese task `ChnSentCorp` for example,
Step1: Download and unarchive the model in path `${MODEL_PATH}`; if everything goes well, there should be a folder named `params` in `$MODEL_PATH`;

Step2: Download and unarchive the data set in `${TASK_DATA_PATH}`. For English tasks, there should be 9 folders named `CoLA`, `MNLI`, `MRPC`, `QNLI`, `QQP`, `RTE`, `SST-2`, `STS-B`, `WNLI`; for Chinese tasks, there should be 6 folders named `cmrc2018`, `drcd`, `xnli`, `msra-ner`, `chnsentcorp`, `nlpcc-dbqa` in `${TASK_DATA_PATH}`;

Step3: Follow the instructions below based on your own task type for starting your programs.
##### Sentence Pair Classification Tasks

Take `RTE` as an example: the data should have 3 fields `text_a text_b label` in tsv format.
#### Sequence Labeling

##### Named Entity Recognition

Take `MSRA-NER(SIGHAN2006)` as an example: the data should have 2 fields, `text_a label`, in tsv format. Remember to set the environment variables as above and run the corresponding fine-tuning script under `script/zh_task`; evaluation results look like:
```
[test evaluation] f1: 0.937390, precision: 0.925988, recall: 0.949077, elapsed time: 36.565929 s
```
#### Machine Reading Comprehension

Take `DRCD` as an example, convert the data into SQUAD format first:
### Pre-training with ERNIE 1.0

#### Data Preprocessing

We construct the training dataset based on [Baidu Baike](https://en.wikipedia.org/wiki/Baidu_Baike), [Baidu Knows(Baidu Zhidao)](https://en.wikipedia.org/wiki/Baidu_Knows), [Baidu Tieba](https://en.wikipedia.org/wiki/Baidu_Tieba) for Chinese version ERNIE, and [Wikipedia](https://en.wikipedia.org/wiki/Wikipedia:Database_download), [Reddit](https://en.wikipedia.org/wiki/Reddit), [BookCorpus](https://github.com/soskek/bookcorpus) for English version ERNIE.

Here are some train instances after processing (which can be found in [`data/demo_train_set.gz`](./data/demo_train_set.gz)).

Each instance is composed of 5 fields joined by `;` on one line, representing `token_ids; sentence_type_ids; position_ids; seg_labels; next_sentence_label` respectively. In the field `seg_labels`, 0 means the beginning of a word, 1 means a non-initial position, -1 means a placeholder, and the other numbers correspond to `CLS` or `SEP`.

#### Pretrain ERNIE 1.0

The start entry for pretrain is [`script/zh_task/pretrain.sh`](./script/zh_task/pretrain.sh). Before we run the train program, remember to set CUDA, cuDNN, and NCCL2 paths in the environment variable LD_LIBRARY_PATH.
### Distillation

ERNIE provides a toolkit for data distillation to further accelerate your inference; see <a href="./distill/README.md">here</a> for details.

### FAQ

#### FAQ1: How to get sentence/tokens embedding of ERNIE?
Run `ernie_encoder.py` to get both the sentence embedding and the token embeddings. The input data format should be the same as that mentioned in the chapter [Fine-tuning](#fine-tuning).

Here is an example to get the sentence embedding and token embeddings for the LCQMC dev dataset:

```
export FLAGS_sync_nccl_allreduce=1
export CUDA_VISIBLE_DEVICES=0
python -u ernie_encoder.py \
    --use_cuda true \
    --batch_size 32 \
    --output_dir "./test" \
    --init_pretraining_params ${MODEL_PATH}/params \
    --data_set ${TASK_DATA_PATH}/lcqmc/dev.tsv \
    --vocab_path config/vocab.txt \
    --max_seq_len 128 \
    --ernie_config_path config/ernie_config.json
```

When the script finishes, `cls_emb.npy` (sentence embeddings) and `top_layer_emb.npy` (token embeddings) will be generated under `./test`.
#### FAQ2: How to predict on new data with Fine-tuning model?

Take classification tasks for example, here is the script for batch prediction:

```
python -u infer_classifyer.py \
    --ernie_config_path ${MODEL_PATH}/ernie_config.json \
    --init_checkpoint "./checkpoints/step_100" \
    --save_inference_model_path ./saved_model \
    --predict_set ${TASK_DATA_PATH}/xnli/test.tsv \
    --vocab_path ${MODEL_PATH}/vocab.txt \
    --num_labels 3
```
Argument `init_checkpoint` is the path of the model, `predict_set` is the path of the test file, and `num_labels` is the number of target labels.

#### FAQ3: Is the argument batch_size for one GPU card or for all GPU cards?

For one GPU card.

#### FAQ4: Can not find library: libcudnn.so. Please try to add the lib path to LD_LIBRARY_PATH.

Export the path of cuda to LD_LIBRARY_PATH, e.g.: `export LD_LIBRARY_PATH=/home/work/cudnn/cudnn_v[your cudnn version]/cuda/lib64`

#### FAQ5: Can not find library: libnccl.so. Please try to add the lib path to LD_LIBRARY_PATH.

Download [NCCL2](https://developer.nvidia.com/nccl/nccl-download), and export the library path to LD_LIBRARY_PATH, e.g.: `export LD_LIBRARY_PATH=/home/work/nccl/lib`
#### FAQ6: Runtime error: `ModuleNotFoundError No module named propeller`<a name="faq6"></a>

You can add propeller to your PYTHONPATH with `export PYTHONPATH=./:$PYTHONPATH`.
#### FAQ7: Cannot malloc XXX MB GPU memory.

Try reducing `batch_size` and `max_seq_len`, and set `FLAGS_eager_delete_tensor_gb=0.0`.
[English](./README.md) | 简体中文

**Try the ERNIE implemented with *dygraph* (eager execution): please check out branch `dygraph`.**

## ERNIE 2.0: A Continual Pre-training Framework for Language Understanding
* [IR Relevance Task](#ir-relevance-task)
* [ERNIE 1.0: <strong>E</strong>nhanced <strong>R</strong>epresentation through k<strong>N</strong>owledge <strong>I</strong>nt<strong>E</strong>gration](#ernie-10-enhanced-representation-through-knowledge-integration)
* [Comparing ERNIE 1.0 and ERNIE 2.0](#对比-ernie-10-和-ernie-20)
* [Results](#效果验证)
* [Results on Chinese Datasets](#中文效果验证)
* [Results on English Datasets](#英文效果验证)
* [ERNIE tiny](#ernie-tiny)
* [Communication](#技术交流)
* [Usage](#使用)
![ernie2.0_paper](.metas/ernie2.0_paper.png)

| **Semantic-aware** | ✅ Next Sentence Prediction | ✅ Discourse Relation | ✅ Discourse Relation <br> ✅ IR Relevance |
## Release Notes

- 2019-07-30: released ERNIE 2.0
- 2019-04-10: updated ERNIE_stable-1.0.1.tar.gz; the model parameters, config ernie_config.json, and vocab.txt are packaged together
- 2019-03-18: updated ERNIE_stable.tgz
- 2019-03-15: released ERNIE 1.0

## Results
### Results on Chinese Datasets

We validate the ERNIE 2.0 Chinese model on 9 tasks: natural language inference (XNLI); machine reading comprehension (DRCD, DuReader, CMRC2018); named entity recognition (MSRA-NER (SIGHAN2006)); sentiment analysis (ChnSentiCorp); semantic similarity (BQ Corpus, LCQMC); and question answering (NLPCC2016-DBQA). Task details and results are described in the following sections.

#### Natural Language Inference
<table>
<tbody>
```text
XNLI is a natural language inference dataset jointly built by Facebook and New York University researchers, covering 15 languages. We use its Chinese portion to evaluate the model's language understanding ability. [url: https://github.com/facebookresearch/XNLI]
```
#### Machine Reading Comprehension

<table>
<tbody>

```text
DRCD is a Traditional Chinese machine reading comprehension dataset released by Delta Research Center; the goal is to extract a contiguous span from the passage as the answer. We converted it to Simplified Chinese in our experiments. [url: https://github.com/DRCKnowledgeTeam/DRCD]
```
#### Named Entity Recognition

<table>
<tbody>
</tbody>
</table>

- **MSRA-NER (SIGHAN2006)**

```text
The MSRA-NER (SIGHAN2006) dataset was released by Microsoft Research Asia; the goal is to identify entities with specific meaning in text, including person, location, and organization names.
```
#### Sentiment Analysis

<table>
<tbody>
```text
ChnSentiCorp is a Chinese sentiment analysis dataset containing online shopping reviews of hotels, laptops, and books.
```
#### Question Answering

<table>
<tbody>
```text
NLPCC2016-DBQA is an evaluation task held in 2016 by NLPCC (the International Conference on Natural Language Processing and Chinese Computing); the goal is to select suitable documents from the candidates as answers to a question. [url: http://tcci.ccf.org.cn/conference/2016/dldoc/evagline2.pdf]
```
#### Semantic Similarity

<table>
<tbody>
### Results on English Datasets

ERNIE 2.0's English results are evaluated on GLUE (official site: https://gluebenchmark.com/). The benchmark covers 10 datasets of different task types, with 11 test sets and 5 kinds of metrics: Accuracy, F1-score, Spearman Corr., Pearson Corr., and Matthew Corr. The GLUE leaderboard ranks algorithms by the average score over all datasets.
#### GLUE - Dev Set Results

| <strong>Dataset</strong> | <strong>CoLA</strong> | <strong>SST-2</strong> | <strong>MRPC</strong> | <strong>STS-B</strong> | <strong>QQP</strong> | <strong>MNLI-m</strong> | <strong>QNLI</strong> | <strong>RTE</strong> |
| ----------- | ---- | ----- | ---- | ----- | ---- | ---- | ---- | ---- |
#### GLUE - Test Set Results

| <strong>Dataset</strong> | - | <strong>CoLA</strong> | <strong>SST-2</strong> | <strong>MRPC</strong> | <strong>STS-B</strong> | <strong>QQP</strong> | <strong>MNLI-m</strong> | <strong>MNLI-mm</strong> | <strong>QNLI</strong> | <strong>RTE</strong> | <strong>WNLI</strong> |<strong>AX</strong>|
| ----------- | ----- | ---- | ----- | ---- | ----- | ---- | ------ | ------- | ---- | ---- | ---- | ---- |
Since XLNet has not published single-model results on the GLUE test set, we compare single models with BERT only. The table above shows the single-model results of ERNIE 2.0 on the GLUE test set.
### ERNIE tiny

To improve ERNIE's applicability in real industrial settings, we release the ERNIE-tiny model.

![ernie_tiny](.metas/ernie_tiny.png)

As a miniaturized ERNIE, ERNIE-tiny uses the following 4 techniques to achieve nearly a 4.3x prediction speedup on real data:

1. Shallow: the 12-layer ERNIE Base model is compressed directly to 3 layers, a linear 4x speedup, though with a fairly large quality drop;
1. Wide: the loss from a shallower model can be compensated by a larger hidden size. Because the fluid inference framework deeply optimizes general matrix multiplication (gemm) for different values of its last dimension (the hidden size), raising the hidden size from 768 to 1024 does not increase latency linearly;
1. Short: ERNIE Tiny is the first open-sourced Chinese pre-trained model with subword granularity. Here "short" means that replacing character (char) granularity with subword granularity noticeably shortens the input text, and input length is linearly related to prediction latency. Statistics show that on the XNLI dev set, sequences segmented with the subword vocabulary are on average 40% shorter than with the character vocabulary;
1. Distilled: to further improve quality, ERNIE Tiny acts as a student and uses model distillation to learn, at the Transformer layers and the prediction layer, the distributions or outputs of the corresponding layers of the teacher ERNIE model, narrowing the quality gap between ERNIE Tiny and ERNIE.

#### Benchmark

The results of the lightweight ERNIE Tiny model on public datasets are shown below: the task average drops only 2.37% relative to ERNIE Base but improves 8% over the "SOTA Before BERT". In latency tests, ERNIE Tiny delivers a 4.3x speedup (test environment: GPU P4, Paddle Inference C++ API, XNLI dev set, max len=128; results averaged over 10 runs).
|model|XNLI (acc)|LCQMC (acc)|ChnSentiCorp (acc)|NLPCC-DBQA (mrr/f1)|Average|Latency|
|--|--|--|--|--|--|--|
|SOTA-before-ERNIE|68.3|83.4|92.2|72.01/-|78.98|-|
|ERNIE2.0-base|79.7|87.9|95.5|95.7/85.3|89.70|633ms (1x)|
|ERNIE-tiny-subword|75.1|86.1|95.2|92.9/78.6|87.33|146ms (4.3x)|
## Communication

- [Github Issues](https://github.com/PaddlePaddle/ERNIE/issues): bug reports, feature requests, install issues, usage issues, etc.
- ERNIE QQ group: 760439550 (ERNIE discussion group).
- [Forum](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
## Usage

* [Install PaddlePaddle](#paddlepaddle安装)
* [Models &amp; Data](#模型数据)
* [English Data](#英文数据)
* [Fine-tuning Tasks](#fine-tuning-任务)
* [Run Configuration](#运行参数配置)
* [Multiprocess Training and fp16 Mixed Precision](#多进程训练与fp16混合精度)
* [Single-Sentence and Sentence-Pair Classification Tasks](#单句和句对分类任务)
* [Single-Sentence Classification](#单句分类任务)
* [Sentence-Pair Classification](#句对分类任务)
* [Sequence Labeling Tasks](#序列标注任务)
* [Named Entity Recognition](#实体识别)
* [Machine Reading Comprehension](#阅读理解任务-1)
* [ERNIE tiny](#tune-ernie-tiny)
* [Development with Propeller](#利用propeller进行二次开发)
* [Pre-training (ERNIE 1.0)](#预训练-ernie-10)
* [Data Preprocessing](#数据预处理)
* [Start Training](#开始训练)
* [Embedding Server](#向量服务器)
* [Distillation](#蒸馏)
* [Deployment](#上线)
* [Generating the inference_model](#生成inference_model)
* [Online Prediction](#在线预测)
* [FAQ](#faq)
* [FAQ1: How to get the ERNIE-encoded embedding of an input sentence/word?](#faq1-如何获取输入句子词经过-ernie-编码后的-embedding-表示)
* [FAQ2: How to batch-predict on new data with a fine-tuned model?](#faq2-如何利用-fine-tuning-得到的模型对新数据进行批量预测)
* [FAQ3: Does the batch size in the run scripts refer to the data per card or the total across cards?](#faq3-运行脚本中的batch-size指的是单卡分配的数据量还是多卡的总数据量)
* [FAQ4: Can not find library: libcudnn.so. Please try to add the lib path to LD_LIBRARY_PATH.](#faq4-can-not-find-library-libcudnnso-please-try-to-add-the-lib-path-to-ld_library_path)
* [FAQ5: Can not find library: libnccl.so. Please try to add the lib path to LD_LIBRARY_PATH.](#faq5-can-not-find-library-libncclso-please-try-to-add-the-lib-path-to-ld_library_path)
* [FAQ6: Runtime error `ModuleNotFoundError: No module named 'propeller'`](#faq6)
## Install PaddlePaddle

This project depends on Paddle 1.6. *Since Paddle 1.6 introduced large API changes relative to earlier versions, running this code base with a pre-1.6 Paddle will cause errors on sequence labeling and other tasks.* Please follow the [installation guide](http://www.paddlepaddle.org/#quick-start) to install it.

**[Important] After installation, promptly add the CUDA, cuDNN, and NCCL2 dynamic library paths to the environment variable LD_LIBRARY_PATH; otherwise training will fail with library errors. For PaddlePaddle configuration details see [here](http://en.paddlepaddle.org/documentation/docs/zh/1.5/beginners_guide/quick_start_cn.html)**

If you want to learn more about Paddle, e.g. modeling real problems or building your own networks, more official documentation is available:
> - [Training neural networks](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/howto/training/index_cn.html): how to use Fluid for single-machine and multi-machine training, and how to save and load model variables
> - [Model evaluation and debugging](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/howto/evaluation_and_debugging/index_cn.html): how to evaluate and debug models under Fluid

ERNIE's other dependencies are listed in `requirements.txt`; install them with
```script
pip install -r requirements.txt
```
## Models & Data

| [ERNIE 1.0 Chinese Base model (max_len=512)](https://ernie.bj.bcebos.com/ERNIE_1.0_max-len-512.tar.gz) | with pre-trained parameters, vocabulary vocab.txt, and model config ernie_config.json |
| [ERNIE 2.0 English Base model](https://ernie.bj.bcebos.com/ERNIE_Base_en_stable-2.0.0.tar.gz) | with pre-trained parameters, vocabulary vocab.txt, and model config ernie_config.json |
| [ERNIE 2.0 English Large model](https://ernie.bj.bcebos.com/ERNIE_Large_en_stable-2.0.0.tar.gz) | with pre-trained parameters, vocabulary vocab.txt, and model config ernie_config.json |
| [ERNIE tiny Chinese model](https://ernie.bj.bcebos.com/ernie_tiny.tar.gz) | with pre-trained parameters, vocabulary vocab.txt, model config ernie_config.json, and the segmentation vocabulary |
| MRPC | 16 / 32 (base) | 2 |
| WNLI | 8 | 1 |
| XNLI | 65536 (tokens) | 8 |
| CMRC2018 | 64 | 8 (large) / 4 (base) |
| DRCD | 64 | 8 (large) / 4 (base) |
| MSRA-NER(SIGHAN 2006) | 16 | 1 |
| ChnSentiCorp | 24 | 1 |
| LCQMC | 32 | 1 |

\* *For MNLI and QNLI we used V100 GPUs with 32 GB of memory; all other tasks used P40s with 22 GB.*
### Multiprocess Training and fp16 Mixed Precision

Use the `finetune_launch.py` script to launch multiprocess training. Multiprocess training makes fuller use of multi-core CPUs / multiple GPUs to speed up fine-tuning.

`finetune_launch.py` must be placed in front of the original fine-tune command. Specify the number of processes per node with `--nproc_per_node` and the GPU card ids per node with `--selected_gpus`; their count generally equals the number of processes and matches `CUDA_VISIBLE_DEVICES`, with ids numbered from 0 (see `script/zh_task/ernie_base/run_xnli.sh`).

Just add `--use_fp16 true` to the training script to enable fp16 mixed-precision training (make sure your hardware supports Tensor Cores). ERNIE casts computation ops to fp16 while still storing parameters in fp32, and uses dynamic loss scaling to avoid gradient vanishing. Roughly a 60% speedup can be observed on XNLI.
### Single-Sentence and Sentence-Pair Classification Tasks
### ERNIE tiny <a name="tune-ernie-tiny"></a>

ERNIE tiny takes subword-granularity input, so preprocessing must add word segmentation and use [sentence piece](https://github.com/google/sentencepiece) for tokenization.

The models needed for segmentation and tokenization are included in the ERNIE tiny [pre-trained model package](#预训练模型下载): `./subword/dict.wordseg.pickle` and `./subword/spm_cased_simp_sampled.model` respectively.

The code under `./example/` has already been adapted to ERNIE tiny's preprocessing: pass the tokenization model via `--sentence_piece_model` and the segmentation model via `--word_dict` in the script, and you can fine-tune ERNIE tiny.

For named-entity-recognition-style tasks, ERNIE tiny still uses Chinese character granularity as input so as to stay aligned with the input annotations; when using `./example/finetune_ner.py`, simply enable `--use_sentence_piece_vocab`.

For concrete usage see the [next section](#利用propeller进行二次开发); a tokenization sketch follows below.
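Below is a minimal sketch of loading the bundled sentencepiece model for tokenization (the preceding word-segmentation step with `dict.wordseg.pickle` is omitted here, so the pieces produced are purely illustrative):

```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("./subword/spm_cased_simp_sampled.model")  # tokenization model shipped with ERNIE tiny

# Real preprocessing would word-segment the text first; raw text is fed here
# only to show the API.
print(sp.EncodeAsPieces("哈尔滨是黑龙江的省会"))
```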
## Development with Propeller

[Propeller](./propeller/README.md) is a one-stop training API built on PaddlePaddle; developers with some machine learning experience can use Propeller for a customized development experience.

You can bring Propeller onto your path with `export PYTHONPATH=./:$PYTHONPATH`.

A basic Propeller tutorial is available at `./example/propeller_xnli_demo.ipynb`.

You only need to define your own model and Dataset; the rest, such as multi-card parallelism and model checkpointing, is handled by Propeller.

`./example/` contains Propeller-based fine-tuning pipelines for classification, ranking, and named entity recognition, which you can use as templates.

The demo data used by the templates can be downloaded from [here](https://ernie.bj.bcebos.com/propeller_demo_data.tar.gz); unpack it into ${TASK_DATA_PATH}.

Taking classification as an example, the script below starts fine-tuning; during training the framework automatically saves the model with the best accuracy under `./output/best/inference`. For online prediction with the inference\_model, see [Online Prediction](#在线预测).
```script
python3 ./example/finetune_classifier.py \
--data_dir ${TASK_DATA_PATH}/chnsenticorp/ \
--warm_start_from ${MODEL_PATH}/params \
--vocab_file ${MODEL_PATH}/vocab.txt \
--max_seqlen 128 \
--run_config '{
"model_dir": "output",
"max_steps": '$((10 * 9600 / 32))',
"save_steps": 100,
"log_steps": 10,
"max_ckpt": 1,
"skip_steps": 0,
"eval_steps": 100
}' \
--hparam ${MODEL_PATH}/ernie_config.json \
--hparam '{ # model definition
"sent_type_vocab_size": None, # default term in official config
"use_task_id": False,
"task_id": 0,
}' \
--hparam '{ # learn
"warmup_proportion": 0.1,
"weight_decay": 0.01,
"use_fp16": 0,
"learning_rate": 0.00005,
"num_label": 2,
"batch_size": 32
}'
```
After fine-tuning completes, add the `--do_predict` flag to the script above to start prediction:
```script
cat input_file | python3 ./example/finetune_classifier.py --do_predict ... > output_score
```
## Pre-training (ERNIE 1.0)

### Data Preprocessing

Sentence-pair data with contextual relations are constructed from encyclopedia, news, and forum-dialogue corpora. Baidu's internal lexical analysis tool segments the sentence pairs at character, word, and entity granularity; the CharTokenizer in [`tokenization.py`](./ernie/tokenization.py) then tokenizes the segmented data into plain-text token sequences with segmentation boundaries, and the tokens are mapped to ids using the vocabulary [`config/vocab.txt`](config/vocab.txt). During training, consecutive tokens are randomly masked according to the segmentation boundaries.

We provide part of the id-converted training data, [`data/demo_train_set.gz`](./data/demo_train_set.gz), and validation data, [`data/demo_valid_set.gz`](./data/demo_valid_set.gz); each line is one training sample in the 5-field format described earlier.

To train on your own real data, adjust the parameters in [`script/zh_task/pretrain.sh`](./script/zh_task/pretrain.sh) accordingly.
## Embedding server
A pretrained ERNIE model can serve directly as a text-feature extractor. The sentence embeddings it predicts plug easily into approximate-nearest-neighbor (ANN) semantic search or into feature-based finetuning for downstream tasks. To make this convenient, we provide an ERNIE server.
The ERNIE server depends on Propeller;
bring Propeller into scope with `export PYTHONPATH=./:$PYTHONPATH`.
Download the inference_model of the Chinese ERNIE1.0-base model from [here](https://ernie.bj.bcebos.com/ernie1.0_zh_inference_model.tar.gz), then start the ERNIE server with:
```script
python3 ernie/service/encoder_server.py -m ./ernie1.0_base_inference_model/ -p 8888 -v --encode_layer pooler
```
`--encode_layer` selects where features are extracted; `pooler` takes the output of ERNIE's pooler fc as the feature.
Query the server as follows (the client currently supports Python 3 only):
```python
from ernie.service.client import ErnieClient
client = ErnieClient('./config/vocab.txt', host='localhost', port=8888)
ret = client(['谁有狂三这张高清的', '英雄联盟什么英雄最好']) # 单句输入
# output:
# array([[-1. , -1. , 0.9937699 , ..., -0.99991065,
# -0.9999997 , -0.9999985 ],
# [-1. , -1. , -0.05038145, ..., -0.9912302 ,
# -0.9999436 , -0.9739356 ]], dtype=float32)
ret = client(['谁有狂三这张高清的', '这张高清图,谁有'], ['英雄联盟什么英雄最好', '英雄联盟最好英雄是什么']) # 句对输入
# output:
# array([[-1. , -0.99528974, -0.99174845, ..., -0.9781673 ,
# -1. , -1. ],
# [-1. , -1. , -0.8699475 , ..., -0.997155 ,
# -1. , -0.99999994]], dtype=float32)
```
## Distillation
ERNIE ships a data-distillation toolkit for compressing and speeding up the model; see <a href="./distill/README.md">here</a> for the workflow.
## Deployment
After finetuning, a few steps produce an inference_model that PaddlePaddle can load in production for efficient prediction.
### Generating the inference_model
Pass `--save_inference_model_path` when running `infer_classifyer.py` to write an inference_model to the given location (see the batch-prediction example in the FAQ below).
If you finetuned with `propeller`, `BestInferenceExporter` picks the best model by the evaluation metric during finetuning and exports its inference_model.
### Online inference
You can then link the model's forward pass into your production stack with the [ERNIE fast inference C++ API](./inference/README.md), or use our prebuilt Python inference engine for a simple service. The following command starts a propeller server:
```script
python -m propeller.tools.start_server -m /path/to/saved/model -p 8888
```
You can call the propeller server conveniently from a Python script:
```python
import numpy as np
from propeller.service.client import InferenceClient
client = InferenceClient('tcp://localhost:8888')
sentence_id = np.array([[[20], [1560], [1175], [8], [42]]], dtype=np.int64)
position_id = np.array([[[0], [1], [2], [3], [4]]], dtype=np.int64)
token_type_id = np.array([[[0], [0], [0], [1], [1]]], dtype=np.int64)
input_mask = np.array([[1., 1., 1., 1., 1.]], dtype=np.float32)
result = client(sentence_id, token_type_id, position_id, input_mask)
```
The `client` takes numpy arrays that correspond to the input tensors specified at save_inference_model time. An inference_model produced by `infer_classifyer.py` expects four arguments: (sentence_id, position_id, token_type_id, input_mask). For an inference_model exported by `propeller`, the arguments match the element types of your `eval_dataset`. `InferenceClient` currently works only under Python 3.
## FAQ
### FAQ1: How do I get the ERNIE embeddings of an input sentence and its tokens?
Use `ernie_encoder.py` to extract the sentence embedding and the embedding of every token; the data format is the same as for the [Fine-tuning tasks](#fine-tuning-任务) described earlier. For example, to extract sentence and token embeddings on the LCQMC dev set:
```
export FLAGS_sync_nccl_allreduce=1
...@@ -940,17 +1108,14 @@ python -u ernie_encoder.py \
Taking classification as an example, we provide a script for batch prediction; usage:
```
python -u infer_classifyer.py \
    --ernie_config_path ${MODEL_PATH}/ernie_config.json \
    --init_checkpoint "./checkpoints/step_100" \
    --save_inference_model_path ./saved_model \
    --predict_set ${TASK_DATA_PATH}/xnli/test.tsv \
    --vocab_path ${MODEL_PATH}/vocab.txt \
    --num_labels 3
```
In practice, point `init_checkpoint` at the model used for prediction, `predict_set` at the file to predict, and set `num_labels` to the number of classes;
...@@ -958,19 +1123,24 @@ python -u predict_classifier.py \
**Note**: predict_set is a 1-column / 2-column tsv file made up of text_a and an optional text_b.
### FAQ3: Does the batch size in the scripts mean the data per GPU or the total across GPUs?
The amount of data assigned to a single GPU.
### FAQ4: Can not find library: libcudnn.so. Please try to add the lib path to LD_LIBRARY_PATH.
Add the cudnn library path to LD_LIBRARY_PATH, e.g. `export LD_LIBRARY_PATH=/home/work/cudnn/cudnn_v[your cudnn version]/cuda/lib64`
### FAQ5: Can not find library: libnccl.so. Please try to add the lib path to LD_LIBRARY_PATH.
Download [NCCL](https://developer.nvidia.com/nccl/nccl-download) first, then add the NCCL library path to LD_LIBRARY_PATH, e.g. `export LD_LIBRARY_PATH=/home/work/nccl/lib`
### FAQ6: Running fails with `ModuleNotFoundError: No module named 'propeller'`<a name="faq6"></a>
Bring Propeller into scope with `export PYTHONPATH=./:$PYTHONPATH`.
### FAQ7: Out of GPU memory: Cannot malloc XXX MB GPU memory.
Reduce batch_size or max_seq_len, and set FLAGS_eager_delete_tensor_gb=0.0 to lower the network's memory footprint.
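For example (a hedged sketch; the values are illustrative starting points, not tuned recommendations):
```script
export FLAGS_eager_delete_tensor_gb=0.0    # free dead tensors eagerly
python -u run_classifier.py --batch_size 16 --max_seq_len 128 ...
```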
* [ERNIE Slim data distillation](#ernie-slim-data-distillation)
* [Three steps of ERNIE data distillation](#three-steps-of-ernie-data-distillation)
* [Data augmentation](#data-augmentation)
* [Tutorial](#tutorial)
* [Offline distillation](#offline-distillation)
* [Online distillation](#online-distillation)
* [Results](#results)
* [Case 1: the user provides unlabeled data](#case1)
* [Case 2: the user provides no unlabeled data](#case2)
* [FAQ](#faq)
# ERNIE Slim data distillation
Behind ERNIE's strong semantic understanding is the equally large amount of compute needed to train and serve a model of this scale. Many industrial applications have strict performance requirements, and without effective compression the model cannot be deployed in practice.
![ernie_distill](../.metas/ernie_distill.png)
As shown above, we therefore built the **ERNIE Slim data distillation system** on top of [data distillation](https://arxiv.org/pdf/1712.04440.pdf). Using data as the bridge, it transfers ERNIE's knowledge into a small model, trading a small accuracy loss for a prediction speedup of up to a thousand times.
### Three steps of ERNIE data distillation
- **Step 1**. Finetune ERNIE on the labeled input data to obtain the Teacher Model.
- **Step 2**. Use the ERNIE Service to predict labels for the following unsupervised data:
    1. large-scale unlabeled data supplied by the user, which must come from the same source as the labeled data
    2. data produced by augmenting the labeled data, with the strategies described in the next section
    3. a mixture of the unlabeled and augmented data in some proportion
- **Step 3.** Train the Student Model on the data from step 2.
### Data augmentation
We currently use three [data augmentation strategies](https://arxiv.org/pdf/1903.12136.pdf), which can be mixed in task-specific proportions (a minimal sketch follows the list):
1. Noising: each token of the original sample is replaced by the "UNK" token with some probability (e.g. 0.1)
2. Same-POS replacement: each token of the original sample is replaced, with some probability (e.g. 0.1), by a random token of the same part of speech drawn from the dataset
3. N-sampling: a span of length m is cut from a random position of the original sample to form a new sample, where m is a random value between 0 and the sample length
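A minimal sketch of the three strategies (an illustration, not code from this repo; `tags` and `pos_vocab` are assumed to come from an external POS tagger):
```python
import random

def add_noise(tokens, p=0.1):
    # 1. noising: replace each token by "UNK" with probability p
    return [t if random.random() >= p else '[UNK]' for t in tokens]

def same_pos_replace(tokens, tags, pos_vocab, p=0.1):
    # 2. same-POS replacement: with probability p, swap a token for a random
    #    token of the same part of speech drawn from the dataset
    return [random.choice(pos_vocab[g]) if random.random() < p and pos_vocab.get(g)
            else t for t, g in zip(tokens, tags)]

def n_sampling(tokens):
    # 3. n-sampling: keep a span of random length m (0..len) from a random start
    m = random.randint(0, len(tokens))
    st = random.randint(0, len(tokens) - m)
    return tokens[st:st + m]
```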
# Tutorial
Using the three strategies above we built augmented data for chnsenticorp: 10 times the original training data (96,000 lines), downloadable from [here](https://ernie.bj.bcebos.com/distill_data.tar.gz). Put the extracted `distill` folder into `${TASK_DATA_PATH}` and run the scripts below to start distillation.
### Offline distillation
In offline distillation, a trained ERNIE model first predicts labels for the unsupervised data, and the student model then learns from those labels. Just run
```script
sh ./distill/script/distill_chnsenticorp.sh
```
to start offline distillation.
The script performs the three steps described above: 1. finetune on the task data; 2. load the finetuned model and score the augmented data; 3. train the Student model. It does hard-label distillation: step 2 directly predicts ERNIE's label.
Two Python files are involved: `./example/finetune_classifier.py` finetunes the teacher model and runs its predictions, and `distill/distill_chnsentocorp.py` trains the student. The pre-built augmented data sits in `${TASK_DATA_PATH}/distill/chnsenticorp/student/unsup_train_aug`.
In step 2, the `--do_predict` flag switches the script into prediction mode:
```script
cat ${TASK_DATA_PATH}/distill/chnsenticorp/student/unsup_train_aug/part.0 |python3 -u ./example/finetune_classifier.py \
--do_predict \
--data_dir ${TASK_DATA_PATH}/distill/chnsenticorp/teacher \
--warm_start_from ${MODEL_PATH}/params \
--vocab_file ${MODEL_PATH}/vocab.txt \
...
```
The script reads plain text from stdin and writes scores to stdout; this is how the augmented unsupervised training corpus gets labeled. The final labels end up in `prediction_output/part.0`, with two columns: the text and the predicted label.
Step 3 then trains the student model:
```script
python3 ./distill/distill_chnsentocorp.py \
--data_dir ${TASK_DATA_PATH}/distill/chnsenticorp/student \
--vocab_file ${TASK_DATA_PATH}/distill/chnsenticorp/student/vocab.txt \
--unsupervise_data_dir ./prediction_output/ \
--max_seqlen 128 \
...
```
Training works as in step 1: `--data_dir` points to the supervised data and `--unsupervise_data_dir` to the ERNIE-labeled data. The Student model is a simple BOW model defined in `distill/distill_chnsentocorp.py`; rewrite its model part to plug in a custom distillation model.
If you already have unsupervised data, simply place it under `${TASK_DATA_PATH}/distill/chnsenticorp/student/unsup_train_aug`.
### Online distillation
In some scenarios the unsupervised data is so large that prediction becomes very slow, or the distributions ERNIE predicts are too big to pre-store on disk. For these cases we propose an **online distillation** scheme: finetune with `propeller` using `BestInferenceModelExporter`, and `propeller` automatically saves the model with the best metric in Paddle inference-model format and then serves it. While training, the Student model queries this service in real time for ERNIE's scores. Just run
```
sh ./distill/script/distill_chnsenticorp_with_propeller_server.sh
```
to run the whole pipeline.
The pipeline has three steps: 1. finetune the ERNIE model; 2. serve the best ERNIE checkpoint with `propeller`; 3. query the service for the teacher's labels while training the student model.
Two Python files are involved: `example/finetune_classifier.py` and `distill/distill_chnsentocorp_with_propeller_server.py`; step 1 works exactly as in offline distillation.
In step 2, run
```script
python3 -m propeller.tools.start_server -p 8113 -m ${teacher_dir}/best/inference/ &
```
to start an ERNIE prediction service.
Step 3 trains the student model in sync with the service:
```script
python3 ./distill/distill_chnsentocorp_with_propeller_server.py \
--data_dir ${TASK_DATA_PATH}/distill/chnsenticorp/student \
--vocab_file ${TASK_DATA_PATH}/distill/chnsenticorp/student/vocab.txt \
--teacher_vocab_file ${MODEL_PATH}/vocab.txt \
--max_seqlen 128 \
--teacher_max_seqlen 128 \
--server_batch_size 64 \
--teacher_host tcp://localhost:8113 \
--num_coroutine 10
```
The script character-tokenizes the augmented data under `${TASK_DATA_PATH}/distill/chnsenticorp/student/unsup_train_aug` and sends it to the `propeller` service. `--num_coroutine` sets the request concurrency, `--teacher_host` the IP and port of the service, and `--server_batch_size` the request batch size; each training batch is actually split into chunks of `--server_batch_size` before being sent.
# Results
We distinguish two practical scenarios:
### Case 1: the user provides unlabeled data<a name="case1"></a>
|Model | Low-quality comment detection [classification \| ACC] | Chinese sentiment [classification \| ACC] | Question detection [classification \| ACC] | Search QA matching [matching \| PNR] |
|---|---|---|---|---|
|ERNIE-Finetune | 90.6% | 96.2% | 97.5% | 4.25 |
|Non-ERNIE baseline (BOW)| 80.8% | 94.7% | 93.0% | 1.83 |
|**+ data distillation** | 87.2% | 95.8% | 96.3% | 3.30 |
### Case 2: the user provides no unlabeled data (data generated by augmentation)<a name="case2"></a>
|Model |ChnSentiCorp |
|---|---|
|ERNIE-Finetune |95.4% |
|Non-ERNIE baseline (BOW)|90.1%|
|**+ data distillation** |91.4%|
|Non-ERNIE baseline (LSTM)|91.2%|
|**+ data distillation**|93.9%|
# FAQ
### FAQ1: Prediction during distillation fails with `Client call failed`
The error printed in the terminal is the client's log; the server's log appears before it. Usually the server ran out of GPU memory. In that case, explicitly cap the request batch size with `--server_batch_size` in the student finetune script.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import time
from random import random
from functools import reduce, partial
import logging
import numpy as np
import multiprocessing
import paddle
import paddle.fluid as F
import paddle.fluid.layers as L
from propeller import log
import propeller.paddle as propeller
from propeller.paddle.data import Dataset
from optimization import optimization
import utils.data
log.setLevel(logging.DEBUG)
class ClassificationBowModel(propeller.train.Model):
"""propeller Model wraper for paddle-ERNIE """
def __init__(self, config, mode, run_config):
self.config = config
self.mode = mode
self.run_config = run_config
self._param_initializer = F.initializer.TruncatedNormal(
scale=config.initializer_range)
self._emb_dtype = "float32"
self._word_emb_name = "word_embedding"
def forward(self, features):
text_ids_a, = features
def bow(ids):
embed = L.embedding(
input=ids,
size=[self.config.vocab_size, self.config.emb_size],
dtype=self._emb_dtype,
param_attr=F.ParamAttr(
name=self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
zero = L.fill_constant(shape=[1], dtype='int64', value=0)
pad = L.cast(L.logical_not(L.equal(ids, zero)), 'float32')
sumed = L.reduce_sum(embed * pad, dim=1)
sumed = L.softsign(sumed)
return sumed
sumed = bow(text_ids_a)
fced = L.fc(
input=sumed,
size=self.config.emb_size,
act='tanh',
param_attr=F.ParamAttr(
name="middle_fc.w_0", initializer=self._param_initializer),
bias_attr="middle_fc.b_0")
logits = L.fc(
input=fced,
size=self.config.num_label,
act=None,
param_attr=F.ParamAttr(
name="pooler_fc.w_0", initializer=self._param_initializer),
bias_attr="pooler_fc.b_0")
if self.mode is propeller.RunMode.PREDICT:
probs = L.softmax(logits)
return probs
else:
return logits
def loss(self, predictions, labels):
labels = L.softmax(labels)
loss = L.softmax_with_cross_entropy(predictions, labels, soft_label=True)
loss = L.mean(loss)
return loss
def backward(self, loss):
scheduled_lr, _ = optimization(
loss=loss,
warmup_steps=int(self.run_config.max_steps * self.config.warmup_proportion),
num_train_steps=self.run_config.max_steps,
learning_rate=self.config.learning_rate,
train_program=F.default_main_program(),
startup_prog=F.default_startup_program(),
weight_decay=self.config.weight_decay,
scheduler="linear_warmup_decay",)
propeller.summary.scalar('lr', scheduled_lr)
def metrics(self, predictions, labels):
predictions = L.argmax(predictions, axis=1)
labels = L.argmax(labels, axis=1)
#predictions = L.unsqueeze(predictions, axes=[1])
acc = propeller.metrics.Acc(labels, predictions)
#auc = propeller.metrics.Auc(labels, predictions)
return {'acc': acc}
if __name__ == '__main__':
parser = propeller.ArgumentParser('Distill model with Paddle')
parser.add_argument('--max_seqlen', type=int, default=128)
parser.add_argument('--vocab_file', type=str, required=True)
parser.add_argument('--unsupervise_data_dir', type=str, required=True)
parser.add_argument('--data_dir', type=str)
args = parser.parse_args()
run_config = propeller.parse_runconfig(args)
hparams = propeller.parse_hparam(args)
vocab = {j.strip().split(b'\t')[0].decode('utf8'): i for i, j in enumerate(open(args.vocab_file, 'rb'))}
unk_id = vocab['[UNK]']
char_tokenizer = utils.data.CharTokenizer(vocab.keys())
space_tokenizer = utils.data.SpaceTokenizer(vocab.keys())
supervise_feature_column = propeller.data.FeatureColumns([
propeller.data.TextColumn('text_a', unk_id=unk_id, vocab_dict=vocab, tokenizer=space_tokenizer),
propeller.data.LabelColumn('label'),
])
def before(text_a, label):
sentence_a = text_a[: args.max_seqlen]
return sentence_a, label
def after(sentence_a, label):
batch_size = sentence_a.shape[0]
onehot_label = np.zeros([batch_size, hparams.num_label], dtype=np.float32)
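        # 9999. acts as a huge logit: L.softmax(labels) in the loss then
        # turns the label row into an (almost) one-hot soft label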
onehot_label[np.arange(batch_size), label] = 9999.
sentence_a, = utils.data.expand_dims(sentence_a)
return sentence_a, onehot_label
train_ds = supervise_feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=True, use_gz=False) \
.map(before) \
.padded_batch(hparams.batch_size, (0, 0)) \
        .map(after)
unsup_train_ds = supervise_feature_column.build_dataset('unsup_train', data_dir=args.unsupervise_data_dir, shuffle=True, repeat=True, use_gz=False) \
.map(before) \
.padded_batch(hparams.batch_size, (0, 0)) \
.map(after)
dev_ds = supervise_feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
.map(before) \
.padded_batch(hparams.batch_size, (0, 0)) \
.map(after)
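    # interleave supervised batches with teacher-labeled batches during training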
train_ds = utils.data.interleave(train_ds, unsup_train_ds)
shapes = ([-1, args.max_seqlen, 1], [-1, hparams.num_label])
types = ('int64', 'float32')
train_ds.data_shapes = shapes
train_ds.data_types = types
dev_ds.data_shapes = shapes
dev_ds.data_types = types
'''
from tqdm import tqdm
for slots in tqdm(train_ds):
pass
'''
best_exporter = propeller.train.exporter.BestExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['dev']['acc'] > old['dev']['acc'])
propeller.train.train_and_eval(
model_class_or_model_fn=ClassificationBowModel,
params=hparams,
run_config=run_config,
train_dataset=train_ds,
eval_dataset={'dev': dev_ds},
exporters=[best_exporter])
print('dev_acc3\t%.5f' % (best_exporter._best['dev']['acc']))
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import time
from random import random
from functools import reduce, partial
import logging
import numpy as np
import multiprocessing
import paddle
import paddle.fluid as F
import paddle.fluid.layers as L
from propeller import log
import propeller.paddle as propeller
from propeller.paddle.data import Dataset
from propeller.service.client import InferenceClient
from optimization import optimization
import utils.data
log.setLevel(logging.DEBUG)
class ClassificationBowModel(propeller.train.Model):
"""propeller Model wraper for paddle-ERNIE """
def __init__(self, config, mode, run_config):
self.config = config
self.mode = mode
self.run_config = run_config
self._param_initializer = F.initializer.TruncatedNormal(
scale=config.initializer_range)
self._emb_dtype = "float32"
self._word_emb_name = "word_embedding"
def forward(self, features):
text_ids_a, = features
def bow(ids):
embed = L.embedding(
input=ids,
size=[self.config.vocab_size, self.config.emb_size],
dtype=self._emb_dtype,
param_attr=F.ParamAttr(
name=self._word_emb_name, initializer=self._param_initializer),
is_sparse=False)
zero = L.fill_constant(shape=[1], dtype='int64', value=0)
pad = L.cast(L.logical_not(L.equal(ids, zero)), 'float32')
sumed = L.reduce_sum(embed * pad, dim=1)
sumed = L.softsign(sumed)
return sumed
sumed = bow(text_ids_a)
fced = L.fc(
input=sumed,
size=self.config.emb_size,
act='tanh',
param_attr=F.ParamAttr(
name="middle_fc.w_0", initializer=self._param_initializer),
bias_attr="middle_fc.b_0")
logits = L.fc(
input=fced,
size=self.config.num_label,
act=None,
param_attr=F.ParamAttr(
name="pooler_fc.w_0", initializer=self._param_initializer),
bias_attr="pooler_fc.b_0")
if self.mode is propeller.RunMode.PREDICT:
probs = L.softmax(logits)
return probs
else:
return logits
def loss(self, predictions, labels):
labels = L.softmax(labels)
loss = L.softmax_with_cross_entropy(predictions, labels, soft_label=True)
loss = L.mean(loss)
return loss
def backward(self, loss):
scheduled_lr, _ = optimization(
loss=loss,
warmup_steps=int(self.run_config.max_steps * self.config.warmup_proportion),
num_train_steps=self.run_config.max_steps,
learning_rate=self.config.learning_rate,
train_program=F.default_main_program(),
startup_prog=F.default_startup_program(),
weight_decay=self.config.weight_decay,
scheduler="linear_warmup_decay",)
propeller.summary.scalar('lr', scheduled_lr)
def metrics(self, predictions, labels):
predictions = L.argmax(predictions, axis=1)
labels = L.argmax(labels, axis=1)
#predictions = L.unsqueeze(predictions, axes=[1])
acc = propeller.metrics.Acc(labels, predictions)
#auc = propeller.metrics.Auc(labels, predictions)
return {'acc': acc}
if __name__ == '__main__':
parser = propeller.ArgumentParser('distill model with ERNIE')
parser.add_argument('--max_seqlen', type=int, default=128)
parser.add_argument('--vocab_file', type=str, required=True)
parser.add_argument('--teacher_vocab_file', type=str, required=True)
parser.add_argument('--teacher_max_seqlen', type=int, default=128)
parser.add_argument('--data_dir', type=str)
parser.add_argument('--server_batch_size', type=int, default=64)
parser.add_argument('--num_coroutine', type=int, default=1)
parser.add_argument('--teacher_host', type=str, required=True)
args = parser.parse_args()
run_config = propeller.parse_runconfig(args)
hparams = propeller.parse_hparam(args)
teacher_vocab = {j.strip().split(b'\t')[0].decode('utf8'): i for i, j in enumerate(open(args.teacher_vocab_file, 'rb'))}
vocab = {j.strip().split(b'\t')[0].decode('utf8'): i for i, j in enumerate(open(args.vocab_file, 'rb'))}
teacher_sep_id = teacher_vocab['[SEP]']
teacher_cls_id = teacher_vocab['[CLS]']
teacher_unk_id = teacher_vocab['[UNK]']
unk_id = vocab['[UNK]']
char_tokenizer = utils.data.CharTokenizer(vocab.keys())
space_tokenizer = utils.data.SpaceTokenizer(vocab.keys())
supervise_feature_column = propeller.data.FeatureColumns([
propeller.data.TextColumn('text_a', unk_id=unk_id, vocab_dict=vocab, tokenizer=space_tokenizer),
propeller.data.LabelColumn('label'),
])
unsupervise_feature_column = propeller.data.FeatureColumns([
propeller.data.TextColumn('text_a', unk_id=unk_id, vocab_dict=vocab, tokenizer=space_tokenizer),
propeller.data.TextColumn('teacher_text_a', unk_id=teacher_unk_id, vocab_dict=teacher_vocab, tokenizer=char_tokenizer),
])
def before(text_a, label):
sentence_a = text_a[: args.max_seqlen]
return sentence_a, label
def after(sentence_a, label):
batch_size = sentence_a.shape[0]
onehot_label = np.zeros([batch_size, hparams.num_label], dtype=np.float32)
onehot_label[np.arange(batch_size), label] = 9999.
sentence_a, = utils.data.expand_dims(sentence_a)
return sentence_a, onehot_label
train_ds = supervise_feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=True, use_gz=False) \
.map(before) \
.padded_batch(hparams.batch_size, (0, 0)) \
        .map(after)
dev_ds = supervise_feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
.map(before) \
.padded_batch(hparams.batch_size, (0, 0)) \
.map(after)
def unsuperve_before(text_a, teacher_text_a):
teacher_sentence, teacher_segments = utils.data.build_1_pair(teacher_text_a, max_seqlen=args.teacher_max_seqlen, cls_id=teacher_cls_id, sep_id=teacher_sep_id)
sentence_a = text_a[: args.max_seqlen]
return sentence_a, teacher_sentence, teacher_segments
client = InferenceClient(args.teacher_host, batch_size=args.server_batch_size, num_coroutine=args.num_coroutine)
log.info('teacher host %s' % args.teacher_host)
def ask_teacher_for_label(sentence_a, teacher_sentence, teacher_segments):
sentence_a, teacher_sentence, teacher_segments = utils.data.expand_dims(sentence_a, teacher_sentence, teacher_segments)
teacher_label, = client(teacher_sentence, teacher_segments)
teacher_label = teacher_label[:, :]
return sentence_a, teacher_label
unsup_train_ds = unsupervise_feature_column.build_dataset('unsup_train', data_dir=os.path.join(args.data_dir, 'unsup_train_aug'), shuffle=True, repeat=True, use_gz=False) \
.buffered(100) \
.map(unsuperve_before) \
.padded_batch(hparams.batch_size, (0, 0, 0)) \
.map(ask_teacher_for_label)
train_ds = utils.data.interleave(train_ds, unsup_train_ds)
shapes = ([-1, args.max_seqlen, 1], [-1, hparams.num_label])
types = ('int64', 'float32')
train_ds.data_shapes = shapes
train_ds.data_types = types
dev_ds.data_shapes = shapes
dev_ds.data_types = types
'''
from tqdm import tqdm
for slots in tqdm(train_ds):
pass
'''
best_exporter = propeller.train.exporter.BestExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['dev']['acc'] > old['dev']['acc'])
propeller.train.train_and_eval(
model_class_or_model_fn=ClassificationBowModel,
params=hparams,
run_config=run_config,
train_dataset=train_ds,
eval_dataset={'dev': dev_ds},
exporters=[best_exporter])
print('dev_acc3\t%.5f' % (best_exporter._best['dev']['acc']))
set -x
export PYTHONPATH=.:./ernie/:${PYTHONPATH:-}
output_dir=./output/distill
teacher_dir=${output_dir}/teacher
student_dir=${output_dir}/student
# 1. finetune teacher
CUDA_VISIBLE_DEVICES=0 \
python3 -u ./example/finetune_classifier.py \
--data_dir ${TASK_DATA_PATH}/distill/chnsenticorp/teacher \
--warm_start_from ${MODEL_PATH}/params \
--vocab_file ${MODEL_PATH}/vocab.txt \
--max_seqlen 128 \
--run_config '{
"model_dir": "'${teacher_dir}'",
"max_steps": '$((10 * 9600 / 32))',
"save_steps": 100,
"log_steps": 10,
"max_ckpt": 1,
"skip_steps": 0,
"eval_steps": 100
}' \
--hparam ${MODEL_PATH}/ernie_config.json \
--hparam '{ # model definition
"sent_type_vocab_size": None, # default term in official config
"use_task_id": False,
"task_id": 0,
}' \
--hparam '{ # learn
"warmup_proportion": 0.1,
"weight_decay": 0.01,
"use_fp16": 0,
"learning_rate": 0.00005,
"num_label": 2,
"batch_size": 32
}'
(($?!=0)) && echo "Something goes wrong at Step 1, please check" && exit -1
# 2. score the augmented data with the finetuned teacher
export CUDA_VISIBLE_DEVICES=0
cat ${TASK_DATA_PATH}/distill/chnsenticorp/student/unsup_train_aug/part.0 |awk -F"\t" '{print $2}' |python3 -u ./example/finetune_classifier.py \
--do_predict \
--data_dir ${TASK_DATA_PATH}/distill/chnsenticorp/teacher \
--warm_start_from ${MODEL_PATH}/params \
--vocab_file ${MODEL_PATH}/vocab.txt \
--max_seqlen 128 \
--run_config '{
"model_dir": "'${teacher_dir}'",
"log_steps": 10,
}' \
--hparam ${MODEL_PATH}/ernie_config.json \
--hparam '{ # model definition
"sent_type_vocab_size": None, # default term in official config
"use_task_id": False,
"task_id": 0,
}' \
--hparam '{ # learn
"warmup_proportion": 0.1,
"weight_decay": 0.01,
"use_fp16": 0,
"learning_rate": 0.00005,
"num_label": 2,
"batch_size": 100
}' > prediction_label
(($?!=0)) && echo "Something goes wrong at Step 2, please check" && exit -1
mkdir prediction_output
paste ${TASK_DATA_PATH}/distill/chnsenticorp/student/unsup_train_aug/part.0 prediction_label |awk -F"\t" '{print $2"\t"$3}' > prediction_output/part.0
# 3. learn from teacher
export CUDA_VISIBLE_DEVICES=0
python3 ./distill/distill_chnsentocorp.py \
--data_dir ${TASK_DATA_PATH}/distill/chnsenticorp/student \
--vocab_file ${TASK_DATA_PATH}/distill/chnsenticorp/student/vocab.txt \
--unsupervise_data_dir ./prediction_output/ \
--max_seqlen 128 \
--run_config '{
"model_dir": "'${student_dir}'",
"max_steps": '$((100 * 9600 / 100))',
"save_steps": 1000,
"log_steps": 10,
"max_ckpt": 1,
"skip_steps": 0,
"eval_steps": 100
}' \
--hparam '{
"num_label": 2,
"vocab_size": 35000,
"emb_size": 128,
"initializer_range": 0.02,
}' \
    --hparam '{ # learn
"warmup_proportion": 0.1,
"weight_decay": 0.00,
"learning_rate": 1e-4,
"batch_size": 100
}'
(($?!=0)) && echo "Something goes wrong at Step 3, please check" && exit -1
set -x
export PYTHONPATH=.:./ernie/:${PYTHONPATH:-}
output_dir=./output/distill
teacher_dir=${output_dir}/teacher
student_dir=${output_dir}/student
# 1. finetune teacher
CUDA_VISIBLE_DEVICES=0 \
python3 -u ./example/finetune_classifier.py \
--data_dir ${TASK_DATA_PATH}/distill/chnsenticorp/teacher \
--warm_start_from ${MODEL_PATH}/params \
--vocab_file ${MODEL_PATH}/vocab.txt \
--max_seqlen 128 \
--run_config '{
"model_dir": "'${teacher_dir}'",
"max_steps": '$((10 * 9600 / 32))',
"save_steps": 100,
"log_steps": 10,
"max_ckpt": 1,
"skip_steps": 0,
"eval_steps": 100
}' \
--hparam ${MODEL_PATH}/ernie_config.json \
--hparam '{ # model definition
"sent_type_vocab_size": None, # default term in official config
"use_task_id": False,
"task_id": 0,
}' \
--hparam '{ # learn
"warmup_proportion": 0.1,
"weight_decay": 0.01,
"use_fp16": 0,
"learning_rate": 0.00005,
"num_label": 2,
"batch_size": 32
}'
(($?!=0)) && echo "Something goes wrong at Step 1, please check" && exit -1
# 2. start a prediction server
CUDA_VISIBLE_DEVICES=1 \
python3 -m propeller.tools.start_server -p 8113 -m ${teacher_dir}/best/inference/ &
echo $! > pid.server
sleep 10
# 3. learn from teacher
export CUDA_VISIBLE_DEVICES=0
python3 ./distill/distill_chnsentocorp_with_propeller_server.py \
--data_dir ${TASK_DATA_PATH}/distill/chnsenticorp/student \
--vocab_file ${TASK_DATA_PATH}/distill/chnsenticorp/student/vocab.txt \
--teacher_vocab_file ${MODEL_PATH}/vocab.txt \
--max_seqlen 128 \
--teacher_max_seqlen 128 \
--server_batch_size 64 \
--teacher_host tcp://localhost:8113 \
--num_coroutine 10 \
--run_config '{
"model_dir": "'${student_dir}'",
"max_steps": '$((100 * 9600 / 100))',
"save_steps": 1000,
"log_steps": 10,
"max_ckpt": 1,
"skip_steps": 0,
"eval_steps": 100
}' \
--hparam '{ # model definition
"num_label": 2,
"vocab_size": 35000,
"emb_size": 128,
"initializer_range": 0.02,
}' \
--hparam '{ # learn
"warmup_proportion": 0.1,
"weight_decay": 0.00,
"learning_rate": 1e-4,
"batch_size": 100
}'
(($?!=0)) && echo "Something goes wrong at Step 2, please check" && exit -1
ps -ef|grep 'propeller.tools.start_server' |awk '{print $2}'|xargs kill -9
...@@ -208,7 +208,7 @@ def pad_batch_data(insts,
     if return_seq_lens:
         seq_lens = np.array([len(inst) for inst in insts])
-        return_list += [seq_lens.astype("int64").reshape([-1, 1])]
+        return_list += [seq_lens.astype("int64").reshape([-1])]

     return return_list if len(return_list) > 1 else return_list[0]
......
...@@ -22,13 +22,16 @@ import argparse
 import numpy as np
 import multiprocessing
+import logging

 import paddle.fluid as fluid

 import reader.task_reader as task_reader
-from model.ernie import ErnieConfig, ErnieModel
+from model.ernie_v1 import ErnieConfig, ErnieModel
-from utils.args import ArgumentGroup, print_arguments
+from utils.args import ArgumentGroup, print_arguments, prepare_logger
 from utils.init import init_pretraining_params

+log = logging.getLogger()

 # yapf: disable
 parser = argparse.ArgumentParser(__doc__)
 model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
...@@ -52,24 +55,21 @@ run_type_g.add_arg("use_cuda", bool, True, "If set, use G
 def create_model(args, pyreader_name, ernie_config):
-    pyreader = fluid.layers.py_reader(
-        capacity=50,
-        shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                [-1, args.max_seq_len, 1], [-1, 1]],
-        dtypes=['int64', 'int64', 'int64', 'int64', 'float', 'int64'],
-        lod_levels=[0, 0, 0, 0, 0, 0],
-        name=pyreader_name,
-        use_double_buffer=True)
+    src_ids = fluid.layers.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    sent_ids = fluid.layers.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    pos_ids = fluid.layers.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    task_ids = fluid.layers.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    input_mask = fluid.layers.data(name='5', shape=[-1, args.max_seq_len, 1], dtype='float32')
+    seq_lens = fluid.layers.data(name='8', shape=[-1], dtype='int64')

-    (src_ids, sent_ids, pos_ids, task_ids, input_mask,
-     seq_lens) = fluid.layers.read_file(pyreader)
+    pyreader = fluid.io.DataLoader.from_generator(feed_list=[src_ids, sent_ids, pos_ids, task_ids, input_mask, seq_lens],
+        capacity=70,
+        iterable=False)

     ernie = ErnieModel(
         src_ids=src_ids,
         position_ids=pos_ids,
         sentence_ids=sent_ids,
-        task_ids=task_ids,
         input_mask=input_mask,
         config=ernie_config)
...@@ -129,8 +129,6 @@ def main(args):
     pyreader, graph_vars = create_model(
         args, pyreader_name='reader', ernie_config=ernie_config)

-    fluid.memory_optimize(input_program=infer_program)
     infer_program = infer_program.clone(for_test=True)

     exe.run(startup_prog)
...@@ -145,7 +143,7 @@ def main(args):
     exec_strategy = fluid.ExecutionStrategy()
     exec_strategy.num_threads = dev_count

-    pyreader.decorate_tensor_provider(data_generator)
+    pyreader.set_batch_generator(data_generator)
     pyreader.start()

     total_cls_emb = []
...@@ -169,15 +167,21 @@ def main(args):
     total_cls_emb = np.concatenate(total_cls_emb)
     total_top_layer_emb = np.concatenate(total_top_layer_emb)

+    if not os.path.exists(args.output_dir):
+        os.mkdir(args.output_dir)
+    else:
+        raise RuntimeError('output dir exists: %s' % args.output_dir)

     with open(os.path.join(args.output_dir, "cls_emb.npy"),
-              "w") as cls_emb_file:
+              "wb") as cls_emb_file:
         np.save(cls_emb_file, total_cls_emb)
     with open(os.path.join(args.output_dir, "top_layer_emb.npy"),
-              "w") as top_layer_emb_file:
+              "wb") as top_layer_emb_file:
         np.save(top_layer_emb_file, total_top_layer_emb)

 if __name__ == '__main__':
+    prepare_logger(log)
     args = parser.parse_args()
     print_arguments(args)
......
...@@ -16,8 +16,11 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+from __future__ import unicode_literals

 import time
+import logging

 import numpy as np
 from scipy.stats import pearsonr, spearmanr
...@@ -26,6 +29,7 @@ import paddle.fluid as fluid
 from model.ernie import ErnieModel

+log = logging.getLogger(__name__)

 def create_model(args,
                  pyreader_name,
...@@ -35,34 +39,22 @@ def create_model(args,
                  is_classify=False,
                  is_regression=False,
                  ernie_version="1.0"):
+    src_ids = fluid.layers.data(name='eval_placeholder_0', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    sent_ids = fluid.layers.data(name='eval_placeholder_1', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    pos_ids = fluid.layers.data(name='eval_placeholder_2', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    input_mask = fluid.layers.data(name='eval_placeholder_3', shape=[-1, args.max_seq_len, 1], dtype='float32')
+    task_ids = fluid.layers.data(name='eval_placeholder_4', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    qids = fluid.layers.data(name='eval_placeholder_5', shape=[-1, 1], dtype='int64')

     if is_classify:
-        pyreader = fluid.layers.py_reader(
-            capacity=50,
-            shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                    [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                    [-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
-            dtypes=[
-                'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'
-            ],
-            lod_levels=[0, 0, 0, 0, 0, 0, 0],
-            name=task_name + "_" + pyreader_name,
-            use_double_buffer=True)
+        labels = fluid.layers.data(name='6', shape=[-1, 1], dtype='int64')
     elif is_regression:
-        pyreader = fluid.layers.py_reader(
-            capacity=50,
-            shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                    [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                    [-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
-            dtypes=[
-                'int64', 'int64', 'int64', 'int64', 'float32', 'float32',
-                'int64'
-            ],
-            lod_levels=[0, 0, 0, 0, 0, 0, 0],
-            name=task_name + "_" + pyreader_name,
-            use_double_buffer=True)
+        labels = fluid.layers.data(name='6', shape=[-1, 1], dtype='float32')

-    (src_ids, sent_ids, pos_ids, task_ids, input_mask, labels,
-     qids) = fluid.layers.read_file(pyreader)
+    pyreader = fluid.io.DataLoader.from_generator(feed_list=[src_ids, sent_ids, pos_ids, task_ids, input_mask, labels, qids],
+        capacity=70,
+        iterable=False)

     ernie = ErnieModel(
         src_ids=src_ids,
...@@ -88,8 +80,12 @@ def create_model(args,
             name=task_name + "_cls_out_b",
             initializer=fluid.initializer.Constant(0.)))

+    assert is_classify != is_regression, 'is_classify or is_regression must be true and only one of them can be true'
     if is_prediction:
+        if is_classify:
             probs = fluid.layers.softmax(logits)
+        else:
+            probs = logits
         feed_targets_name = [
             src_ids.name, sent_ids.name, pos_ids.name, input_mask.name
         ]
...@@ -97,7 +93,6 @@ def create_model(args,
             feed_targets_name += [task_ids.name]
         return pyreader, probs, feed_targets_name

-    assert is_classify != is_regression, 'is_classify or is_regression must be true and only one of them can be true'
     num_seqs = fluid.layers.create_tensor(dtype='int64')
     if is_classify:
         ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
......
...@@ -16,12 +16,15 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+from __future__ import unicode_literals

 import time
 import numpy as np
 import os
 import math
 import json
+import logging
 import collections
 import six
...@@ -34,21 +37,21 @@ from model.ernie import ErnieModel
 import tokenization

+log = logging.getLogger(__name__)

 def create_model(args, pyreader_name, ernie_config, is_training):
-    pyreader = fluid.layers.py_reader(
-        capacity=50,
-        shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                [-1, args.max_seq_len, 1], [-1, 1], [-1, 1], [-1, 1]],
-        dtypes=[
-            'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64',
-            'int64'
-        ],
-        lod_levels=[0, 0, 0, 0, 0, 0, 0, 0],
-        name=pyreader_name,
-        use_double_buffer=True)
+    src_ids = fluid.layers.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    pos_ids = fluid.layers.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    sent_ids = fluid.layers.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    task_ids = fluid.layers.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    input_mask = fluid.layers.data(name='5', shape=[-1, args.max_seq_len, 1], dtype='float32')
+    start_positions = fluid.layers.data(name='6', shape=[-1, 1], dtype='int64')
+    end_positions = fluid.layers.data(name='7', shape=[-1, 1], dtype='int64')
+    unique_id = fluid.layers.data(name='8', shape=[-1, 1], dtype='int64')

-    (src_ids, sent_ids, pos_ids, task_ids, input_mask, start_positions,
-     end_positions, unique_id) = fluid.layers.read_file(pyreader)
+    pyreader = fluid.io.DataLoader.from_generator(feed_list=[
+        src_ids, sent_ids, pos_ids, task_ids, input_mask, start_positions,
+        end_positions, unique_id], capacity=50, iterable=False)

     ernie = ErnieModel(
         src_ids=src_ids,
...@@ -151,7 +154,7 @@ def evaluate(exe,
             program=test_program, fetch_list=fetch_list)
         for idx in range(np_unique_ids.shape[0]):
             if len(all_results) % 1000 == 0:
-                print("Processing example: %d" % len(all_results))
+                log.info("Processing example: %d" % len(all_results))
             unique_id = int(np_unique_ids[idx])
             start_logits = [float(x) for x in np_start_logits[idx].flat]
             end_logits = [float(x) for x in np_end_logits[idx].flat]
...@@ -179,7 +182,7 @@ def evaluate(exe,
     time_end = time.time()
     elapsed_time = time_end - time_begin

-    print(
+    log.info(
         "[%s evaluation] em: %f, f1: %f, avg: %f, questions: %d, elapsed time: %f"
         % (eval_phase, em, f1, avg, total, elapsed_time))
...@@ -188,8 +191,8 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
                       max_answer_length, do_lower_case, output_prediction_file,
                       output_nbest_file):
     """Write final predictions to the json file and log-odds of null if needed."""
-    print("Writing predictions to: %s" % (output_prediction_file))
-    print("Writing nbest to: %s" % (output_nbest_file))
+    log.info("Writing predictions to: %s" % (output_prediction_file))
+    log.info("Writing nbest to: %s" % (output_nbest_file))

     example_index_to_features = collections.defaultdict(list)
     for feature in all_features:
......
...@@ -15,6 +15,9 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+from __future__ import unicode_literals

 import os
 import time
...@@ -23,28 +26,27 @@ import numpy as np
 import multiprocessing

 import paddle
+import logging
 import paddle.fluid as fluid

 from six.moves import xrange

 from model.ernie import ErnieModel

+log = logging.getLogger(__name__)

 def create_model(args, pyreader_name, ernie_config, is_prediction=False):
-    pyreader = fluid.layers.py_reader(
-        capacity=50,
-        shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
-                [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1]],
-        dtypes=[
-            'int64', 'int64', 'int64', 'int64', 'float32', 'int64', 'int64'
-        ],
-        lod_levels=[0, 0, 0, 0, 0, 0, 0],
-        name=pyreader_name,
-        use_double_buffer=True)
+    src_ids = fluid.layers.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    sent_ids = fluid.layers.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    pos_ids = fluid.layers.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    task_ids = fluid.layers.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    input_mask = fluid.layers.data(name='5', shape=[-1, args.max_seq_len, 1], dtype='float32')
+    labels = fluid.layers.data(name='7', shape=[-1, args.max_seq_len, 1], dtype='int64')
+    seq_lens = fluid.layers.data(name='8', shape=[-1], dtype='int64')

-    (src_ids, sent_ids, pos_ids, task_ids, input_mask, labels,
-     seq_lens) = fluid.layers.read_file(pyreader)
+    pyreader = fluid.io.DataLoader.from_generator(feed_list=[src_ids, sent_ids, pos_ids, task_ids, input_mask, labels, seq_lens],
+        capacity=70,
+        iterable=False)

     ernie = ErnieModel(
         src_ids=src_ids,
...@@ -70,9 +72,7 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
             initializer=fluid.initializer.Constant(0.)))

     infers = fluid.layers.argmax(logits, axis=2)
-    ret_labels = fluid.layers.reshape(x=labels, shape=[-1, 1])
     ret_infers = fluid.layers.reshape(x=infers, shape=[-1, 1])
     lod_labels = fluid.layers.sequence_unpad(labels, seq_lens)
     lod_infers = fluid.layers.sequence_unpad(infers, seq_lens)
...@@ -92,18 +92,14 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
     ce_loss = ce_loss * input_mask
     loss = fluid.layers.mean(x=ce_loss)

-    if args.use_fp16 and args.loss_scaling > 1.0:
-        loss *= args.loss_scaling

     graph_vars = {
+        "inputs": src_ids,
         "loss": loss,
         "probs": probs,
-        "labels": ret_labels,
+        "seqlen": seq_lens,
-        "infers": ret_infers,
         "num_infer": num_infer,
         "num_label": num_label,
         "num_correct": num_correct,
-        "seq_lens": seq_lens
     }

     for k, v in graph_vars.items():
...@@ -112,92 +108,12 @@ def create_model(args, pyreader_name, ernie_config, is_prediction=False):
     return pyreader, graph_vars

-def chunk_eval(np_labels, np_infers, np_lens, tag_num, dev_count=1):
-    def extract_bio_chunk(seq):
-        chunks = []
-        cur_chunk = None
-        null_index = tag_num - 1
-        for index in xrange(len(seq)):
-            tag = seq[index]
-            tag_type = tag // 2
-            tag_pos = tag % 2
-
-            if tag == null_index:
-                if cur_chunk is not None:
-                    chunks.append(cur_chunk)
-                cur_chunk = None
-                continue
-
-            if tag_pos == 0:
-                if cur_chunk is not None:
-                    chunks.append(cur_chunk)
-                    cur_chunk = {}
-                cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
-            else:
-                if cur_chunk is None:
-                    cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
-                    continue
-
-                if cur_chunk["type"] == tag_type:
-                    cur_chunk["en"] = index + 1
-                else:
-                    chunks.append(cur_chunk)
-                    cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
-
-        if cur_chunk is not None:
-            chunks.append(cur_chunk)
-        return chunks
-
-    null_index = tag_num - 1
-    num_label = 0
-    num_infer = 0
-    num_correct = 0
-    labels = np_labels.reshape([-1]).astype(np.int32).tolist()
-    infers = np_infers.reshape([-1]).astype(np.int32).tolist()
-    all_lens = np_lens.reshape([dev_count, -1]).astype(np.int32).tolist()
-
-    base_index = 0
-    for dev_index in xrange(dev_count):
-        lens = all_lens[dev_index]
-        max_len = 0
-        for l in lens:
-            max_len = max(max_len, l)
-
-        for i in xrange(len(lens)):
-            seq_st = base_index + i * max_len + 1
-            seq_en = seq_st + (lens[i] - 2)
-            infer_chunks = extract_bio_chunk(infers[seq_st:seq_en])
-            label_chunks = extract_bio_chunk(labels[seq_st:seq_en])
-            num_infer += len(infer_chunks)
-            num_label += len(label_chunks)
-
-            infer_index = 0
-            label_index = 0
-            while label_index < len(label_chunks) \
-                    and infer_index < len(infer_chunks):
-                if infer_chunks[infer_index]["st"] \
-                        < label_chunks[label_index]["st"]:
-                    infer_index += 1
-                elif infer_chunks[infer_index]["st"] \
-                        > label_chunks[label_index]["st"]:
-                    label_index += 1
-                else:
-                    if infer_chunks[infer_index]["en"] \
-                            == label_chunks[label_index]["en"] \
-                            and infer_chunks[infer_index]["type"] \
-                            == label_chunks[label_index]["type"]:
-                        num_correct += 1
-
-                    infer_index += 1
-                    label_index += 1
-
-        base_index += max_len * len(lens)
-
-    return num_label, num_infer, num_correct
-
-    num_infer = np.sum(num_infer)
-    num_label = np.sum(num_label)
-    num_correct = np.sum(num_correct)

 def calculate_f1(num_label, num_infer, num_correct):
     if num_infer == 0:
         precision = 0.0
     else:
...@@ -220,34 +136,12 @@ def evaluate(exe,
              pyreader,
              graph_vars,
              tag_num,
-             eval_phase,
              dev_count=1):
     fetch_list = [
         graph_vars["num_infer"].name, graph_vars["num_label"].name,
         graph_vars["num_correct"].name
     ]

-    if eval_phase == "train":
-        fetch_list.append(graph_vars["loss"].name)
-        if "learning_rate" in graph_vars:
-            fetch_list.append(graph_vars["learning_rate"].name)
-        outputs = exe.run(fetch_list=fetch_list)
-        np_num_infer, np_num_label, np_num_correct, np_loss = outputs[:4]
-        num_label = np.sum(np_num_label)
-        num_infer = np.sum(np_num_infer)
-        num_correct = np.sum(np_num_correct)
-        precision, recall, f1 = calculate_f1(num_label, num_infer, num_correct)
-        rets = {
-            "precision": precision,
-            "recall": recall,
-            "f1": f1,
-            "loss": np.mean(np_loss)
-        }
-        if "learning_rate" in graph_vars:
-            rets["lr"] = float(outputs[4][0])
-        return rets
-
-    else:
-        total_label, total_infer, total_correct = 0.0, 0.0, 0.0
-        time_begin = time.time()
-        pyreader.start()
+    total_label, total_infer, total_correct = 0.0, 0.0, 0.0
+    time_begin = time.time()
+    pyreader.start()
...@@ -266,7 +160,59 @@ def evaluate(exe,
     precision, recall, f1 = calculate_f1(total_label, total_infer,
                                          total_correct)
     time_end = time.time()
-    print(
-        "[%s evaluation] f1: %f, precision: %f, recall: %f, elapsed time: %f s"
-        % (eval_phase, f1, precision, recall, time_end - time_begin))
+    return \
+        "[evaluation] f1: %f, precision: %f, recall: %f, elapsed time: %f s" \
+        % (f1, precision, recall, time_end - time_begin)

+def chunk_predict(np_inputs, np_probs, np_lens, dev_count=1):
+    inputs = np_inputs.reshape([-1]).astype(np.int32)
+    probs = np_probs.reshape([-1, np_probs.shape[-1]])
+    all_lens = np_lens.reshape([dev_count, -1]).astype(np.int32).tolist()
+    base_index = 0
+    out = []
+    for dev_index in xrange(dev_count):
+        lens = all_lens[dev_index]
+        max_len = 0
+        for l in lens:
+            max_len = max(max_len, l)
+
+        for i in xrange(len(lens)):
+            seq_st = base_index + i * max_len + 1
+            seq_en = seq_st + (lens[i] - 2)
+            prob = probs[seq_st:seq_en, :]
+            infers = np.argmax(prob, -1)
+            out.append((
+                inputs[seq_st:seq_en].tolist(),
+                infers.tolist(),
+                prob.tolist()))
+        base_index += max_len * len(lens)
+    return out

+def predict(exe,
+            test_program,
+            test_pyreader,
+            graph_vars,
+            dev_count=1):
+    fetch_list = [
+        graph_vars["inputs"].name,
+        graph_vars["probs"].name,
+        graph_vars["seqlen"].name,
+    ]

+    test_pyreader.start()
+    res = []
+    while True:
+        try:
+            inputs, probs, np_lens = exe.run(program=test_program,
+                                             fetch_list=fetch_list)
+            r = chunk_predict(inputs, probs, np_lens, dev_count)
+            res += r
+        except fluid.core.EOFException:
+            test_pyreader.reset()
+            break
+    return res
...@@ -11,10 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+from __future__ import unicode_literals

 import os
 import time
...@@ -47,10 +49,21 @@ train_g.add_arg("warmup_proportion", float, 0.1,
 train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
 train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
 train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
-train_g.add_arg("loss_scaling", float, 1.0,
-                "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
+train_g.add_arg("use_dynamic_loss_scaling", bool, True, "Whether to use dynamic loss scaling.")
+train_g.add_arg("init_loss_scaling", float, 102400,
+                "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
-train_g.add_arg("test_save", str, "test_result", "test_save")
+train_g.add_arg("test_save", str, "./checkpoints/test_result", "test_save")
 train_g.add_arg("metric", str, "simple_accuracy", "metric")
+train_g.add_arg("incr_every_n_steps", int, 100, "Increases loss scaling every n consecutive.")
+train_g.add_arg("decr_every_n_nan_or_inf", int, 2,
+                "Decreases loss scaling every n accumulated steps with nan or inf gradients.")
+train_g.add_arg("incr_ratio", float, 2.0,
+                "The multiplier to use when increasing the loss scaling.")
+train_g.add_arg("decr_ratio", float, 0.8,
+                "The less-than-one-multiplier to use when decreasing.")

 log_g = ArgumentGroup(parser, "logging", "logging related.")
 log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
...@@ -86,6 +99,7 @@ data_g.add_arg("chunk_scheme", type=str, default="IOB", choices=["IO", "IOB", "
 run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
 run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
+run_type_g.add_arg("is_distributed", bool, False, "If set, then start distributed training.")
 run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
 run_type_g.add_arg("num_iteration_per_drop_scope", int, 10, "Iteration intervals to drop scope.")
 run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import sys
import subprocess
import os
import six
import copy
import argparse
import time
import logging
from utils.args import ArgumentGroup, print_arguments, prepare_logger
from finetune_args import parser as worker_parser
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
multip_g = ArgumentGroup(parser, "multiprocessing",
"start paddle training using multi-processing mode.")
multip_g.add_arg("node_ips", str, None,
"paddle trainer ips")
multip_g.add_arg("node_id", int, 0,
"the trainer id of the node for multi-node distributed training.")
multip_g.add_arg("print_config", bool, True,
"print the config of multi-processing mode.")
multip_g.add_arg("current_node_ip", str, None,
"the ip of current node.")
multip_g.add_arg("split_log_path", str, "log",
"log path for each trainer.")
multip_g.add_arg("log_prefix", str, "",
"the prefix name of job log.")
multip_g.add_arg("nproc_per_node", int, 8,
"the number of process to use on each node.")
multip_g.add_arg("selected_gpus", str, "0,1,2,3,4,5,6,7",
"the gpus selected to use.")
multip_g.add_arg("training_script", str, None, "the program/script to be lauched "
"in parallel followed by all the arguments", positional_arg=True)
multip_g.add_arg("training_script_args", str, None,
"training script args", positional_arg=True, nargs=argparse.REMAINDER)
# yapf: enable
log = logging.getLogger()
def start_procs(args):
    procs = []
    log_fns = []

    default_env = os.environ.copy()

    node_id = args.node_id
    node_ips = [x.strip() for x in args.node_ips.split(',')]
    current_ip = args.current_node_ip
    if args.current_node_ip is None:
        assert len(node_ips) == 1
        current_ip = node_ips[0]
    log.info(current_ip)
    num_nodes = len(node_ips)
    selected_gpus = [x.strip() for x in args.selected_gpus.split(',')]
    selected_gpu_num = len(selected_gpus)

    all_trainer_endpoints = ""
    for ip in node_ips:
        for i in range(args.nproc_per_node):
            if all_trainer_endpoints != "":
                all_trainer_endpoints += ","
            all_trainer_endpoints += "%s:617%d" % (ip, i)

    nranks = num_nodes * args.nproc_per_node
    # ceiling division: the original code took the remainder with the operands
    # swapped, which mis-sized the groups whenever nproc_per_node did not
    # divide the GPU count
    gpus_per_proc = selected_gpu_num // args.nproc_per_node
    if selected_gpu_num % args.nproc_per_node != 0:
        gpus_per_proc += 1
    selected_gpus_per_proc = [
        selected_gpus[i:i + gpus_per_proc]
        for i in range(0, len(selected_gpus), gpus_per_proc)
    ]

    if args.print_config:
        log.info("all_trainer_endpoints: %s"
                 ", node_id: %s"
                 ", current_ip: %s"
                 ", num_nodes: %s"
                 ", node_ips: %s"
                 ", gpus_per_proc: %s"
                 ", selected_gpus_per_proc: %s"
                 ", nranks: %s" % (
                     all_trainer_endpoints,
                     node_id,
                     current_ip,
                     num_nodes,
                     node_ips,
                     gpus_per_proc,
                     selected_gpus_per_proc,
                     nranks))

    current_env = copy.copy(default_env)
    procs = []
    cmds = []
    log_fns = []
    for i in range(0, args.nproc_per_node):
        trainer_id = node_id * args.nproc_per_node + i
        assert current_ip is not None
        current_env.update({
            "FLAGS_selected_gpus": "%s" % ",".join([str(s) for s in selected_gpus_per_proc[i]]),
            "PADDLE_TRAINER_ID": "%d" % trainer_id,
            "PADDLE_CURRENT_ENDPOINT": "%s:617%d" % (current_ip, i),
            "PADDLE_TRAINERS_NUM": "%d" % nranks,
            "PADDLE_TRAINER_ENDPOINTS": all_trainer_endpoints,
            "PADDLE_NODES_NUM": "%d" % num_nodes
        })

        try:
            idx = args.training_script_args.index('--is_distributed')
            args.training_script_args[idx + 1] = 'true'
        except ValueError:
            args.training_script_args += ['--is_distributed', 'true']

        cmd = [sys.executable, "-u",
               args.training_script] + args.training_script_args
        cmds.append(cmd)

        if args.split_log_path:
            logdir = "%s/%sjob.log.%d" % (args.split_log_path, args.log_prefix, trainer_id)
            try:
                os.mkdir(os.path.dirname(logdir))
            except OSError:
                pass
            fn = open(logdir, "a")
            log_fns.append(fn)
            process = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
            log.info('subprocess launched, check log at %s' % logdir)
        else:
            process = subprocess.Popen(cmd, env=current_env)
            log.info('subprocess launched')
        procs.append(process)

    try:
        for i in range(len(procs)):
            proc = procs[i]
            proc.wait()
            if len(log_fns) > 0:
                log_fns[i].close()
            if proc.returncode != 0:
                raise subprocess.CalledProcessError(returncode=procs[i].returncode,
                                                    cmd=cmds[i])
            else:
                log.info("proc %d finished" % i)
    except KeyboardInterrupt as e:
        for p in procs:
            log.info('killing %s' % p)
            p.terminate()
def main(args):
    if args.print_config:
        print_arguments(args)
    start_procs(args)
if __name__ == "__main__":
prepare_logger(log)
lanch_args = parser.parse_args()
finetuning_args = worker_parser.parse_args(
lanch_args.training_script_args)
init_path = finetuning_args.init_pretraining_params
log.info("init model: %s" % init_path)
    if not finetuning_args.use_fp16:
        os.system('rename .master "" ' + init_path + '/*.master')
    main(launch_args)
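A standalone re-derivation of the endpoint and GPU bookkeeping in `start_procs`, under assumed single-node values (all values here are hypothetical). Note the `"%s:617%d"` scheme only yields distinct ports for worker indices 0 through 9:

```python
# Assumed values for one node with 4 workers and 4 visible GPUs.
node_ips = ["127.0.0.1"]
nproc_per_node = 4
selected_gpus = ["0", "1", "2", "3"]

endpoints = ["%s:617%d" % (ip, i)
             for ip in node_ips for i in range(nproc_per_node)]
gpus_per_proc = len(selected_gpus) // nproc_per_node
gpu_groups = [selected_gpus[i:i + gpus_per_proc]
              for i in range(0, len(selected_gpus), gpus_per_proc)]

print(endpoints)   # ['127.0.0.1:6170', '127.0.0.1:6171', '127.0.0.1:6172', '127.0.0.1:6173']
print(gpu_groups)  # [['0'], ['1'], ['2'], ['3']] -> one GPU per worker
```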
@@ -11,16 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import time
import argparse
import numpy as np
import logging
import multiprocessing

# NOTE(paddle-dev): All of these flags should be
@@ -39,14 +41,14 @@ from reader.task_reader import ClassifyReader
from model.ernie import ErnieConfig
from finetune.classifier import create_model

from utils.args import print_arguments, check_cuda, prepare_logger, ArgumentGroup
from utils.init import init_pretraining_params
from finetune_args import parser

# yapf: disable
parser = argparse.ArgumentParser(__doc__)
model_g = ArgumentGroup(parser, "model", "options to init, resume and save model.")
model_g.add_arg("ernie_config_path", str, None, "Path to the json file for ernie model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("save_inference_model_path", str, "inference_model", "If set, save the inference model to this path.")
model_g.add_arg("use_fp16", bool, False, "Whether to resume parameters from fp16 checkpoint.")
@@ -66,6 +68,7 @@ run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for trai...
run_type_g.add_arg("do_prediction", bool, True, "Whether to do prediction on test set.")

args = parser.parse_args()
log = logging.getLogger()
# yapf: enable.

def main(args):
@@ -113,7 +116,7 @@ def main(args):
    _, ckpt_dir = os.path.split(args.init_checkpoint.rstrip('/'))
    dir_name = ckpt_dir + '_inference_model'
    model_path = os.path.join(args.save_inference_model_path, dir_name)
    log.info("save inference model to %s" % model_path)
    fluid.io.save_inference_model(
        model_path,
        feed_target_names, [probs],
@@ -125,8 +128,12 @@ def main(args):
    #config = AnalysisConfig(os.path.join(model_path, "__model__"), os.path.join(model_path, ""))
    config = AnalysisConfig(model_path)
    if not args.use_cuda:
        log.info("disable gpu")
        config.disable_gpu()
        config.switch_ir_optim(True)
    else:
        log.info("using gpu")
        config.enable_use_gpu(1024)

    # Create PaddlePredictor
    predictor = create_paddle_predictor(config)
@@ -137,7 +144,7 @@ def main(args):
        epoch=1,
        shuffle=False)

    log.info("-------------- prediction results --------------")
    np.set_printoptions(precision=4, suppress=True)
    index = 0
    total_time = 0
@@ -156,32 +163,20 @@ def main(args):
        # parse outputs
        output = outputs[0]
        batch_result = output.as_ndarray()
        for single_example_probs in batch_result:
            print('\t'.join(map(str, single_example_probs.tolist())))
            index += 1
    log.info("qps:{}\ttotal_time:{}\ttotal_example:{}\tbatch_size:{}".format(
        index / total_time, total_time, index, args.batch_size))

def array2tensor(ndarray):
    """ convert numpy array to PaddleTensor"""
    assert isinstance(ndarray, np.ndarray), "input type must be np.ndarray"
    tensor = PaddleTensor(data=ndarray)
    return tensor

if __name__ == '__main__':
    prepare_logger(log)
    print_arguments(args)
    main(args)
@@ -16,14 +16,19 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import json
import six
import logging
import paddle.fluid as fluid
from io import open
from paddle.fluid.layers import core

from model.transformer_encoder import encoder, pre_process_layer

log = logging.getLogger(__name__)
class ErnieConfig(object):
    def __init__(self, config_path):
@@ -31,7 +36,7 @@ class ErnieConfig(object):
    def _parse(self, config_path):
        try:
            with open(config_path, 'r', encoding='utf8') as json_file:
                config_dict = json.load(json_file)
        except Exception:
            raise IOError("Error in parsing Ernie model config file '%s'" %
@@ -44,8 +49,8 @@ class ErnieConfig(object):
    def print_config(self):
        for arg, value in sorted(six.iteritems(self._config_dict)):
            log.info('%s: %s' % (arg, value))
        log.info('------------------------------------------------')

class ErnieModel(object):
@@ -81,8 +86,8 @@ class ErnieModel(object):
        self._pos_emb_name = "pos_embedding"
        self._sent_emb_name = "sent_embedding"
        self._task_emb_name = "task_embedding"
        self._dtype = core.VarDesc.VarType.FP16 if use_fp16 else core.VarDesc.VarType.FP32
        self._emb_dtype = core.VarDesc.VarType.FP32

        # Initialize all weights by truncated normal initializer, and all biases
        # will be initialized by constant zero by default.
@@ -134,7 +139,7 @@ class ErnieModel(object):
        emb_out = pre_process_layer(
            emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')

        if self._dtype == core.VarDesc.VarType.FP16:
            emb_out = fluid.layers.cast(x=emb_out, dtype=self._dtype)
            input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)
        self_attn_mask = fluid.layers.matmul(
@@ -163,6 +168,10 @@ class ErnieModel(object):
            postprocess_cmd="dan",
            param_initializer=self._param_initializer,
            name='encoder')

        if self._dtype == core.VarDesc.VarType.FP16:
            self._enc_out = fluid.layers.cast(
                x=self._enc_out, dtype=self._emb_dtype)

    def get_sequence_output(self):
        return self._enc_out
@@ -171,9 +180,6 @@ class ErnieModel(object):
        """Get the first feature of each sequence for classification"""
        next_sent_feat = fluid.layers.slice(
            input=self._enc_out, axes=[1], starts=[0], ends=[1])
        next_sent_feat = fluid.layers.fc(
            input=next_sent_feat,
            size=self._emb_size,
@@ -194,8 +200,6 @@ class ErnieModel(object):
            x=self._enc_out, shape=[-1, self._emb_size])
        # extract masked tokens' feature
        mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)

        # transform: fc
        mask_trans_feat = fluid.layers.fc(
@@ -16,14 +16,19 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import json
import logging
import six
import paddle.fluid as fluid
from io import open
from paddle.fluid.layers import core

from model.transformer_encoder import encoder, pre_process_layer

log = logging.getLogger(__name__)

class ErnieConfig(object):
    def __init__(self, config_path):
@@ -31,7 +36,7 @@ class ErnieConfig(object):
    def _parse(self, config_path):
        try:
            with open(config_path, 'r', encoding='utf8') as json_file:
                config_dict = json.load(json_file)
        except Exception:
            raise IOError("Error in parsing Ernie model config file '%s'" %
@@ -44,8 +49,8 @@ class ErnieConfig(object):
    def print_config(self):
        for arg, value in sorted(six.iteritems(self._config_dict)):
            log.info('%s: %s' % (arg, value))
        log.info('------------------------------------------------')

class ErnieModel(object):
@@ -72,7 +77,7 @@ class ErnieModel(object):
        self._word_emb_name = "word_embedding"
        self._pos_emb_name = "pos_embedding"
        self._sent_emb_name = "sent_embedding"
        self._dtype = core.VarDesc.VarType.FP16 if use_fp16 else core.VarDesc.VarType.FP32

        # Initialize all weights by truncated normal initializer, and all biases
        # will be initialized by constant zero by default.
@@ -110,7 +115,7 @@ class ErnieModel(object):
        emb_out = pre_process_layer(
            emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')

        if self._dtype == core.VarDesc.VarType.FP16:
            input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)
        self_attn_mask = fluid.layers.matmul(
            x=input_mask, y=input_mask, transpose_y=True)
@@ -16,10 +16,13 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
import paddle.fluid as fluid

from utils.fp16 import create_master_params_grads, master_param_to_train_param, apply_dynamic_loss_scaling

def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
@@ -42,8 +42,18 @@ train_g.add_arg("warmup_steps", int, 5000, "Total steps to perform wa...
train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
train_g.add_arg("use_dynamic_loss_scaling", bool, True, "Whether to use dynamic loss scaling.")
train_g.add_arg("init_loss_scaling", float, 102400,
                "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")
train_g.add_arg("incr_every_n_steps", int, 100,
                "Increase loss scaling every n consecutive steps with finite gradients.")
train_g.add_arg("decr_every_n_nan_or_inf", int, 2,
                "Decrease loss scaling every n accumulated steps with nan or inf gradients.")
train_g.add_arg("incr_ratio", float, 2.0,
                "The multiplier to use when increasing the loss scaling.")
train_g.add_arg("decr_ratio", float, 0.8,
                "The less-than-one multiplier to use when decreasing the loss scaling.")

log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import sys
import subprocess
import os
import six
import copy
import argparse
import time
import logging
from utils.args import ArgumentGroup, print_arguments, prepare_logger
from pretrain_args import parser as worker_parser
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
multip_g = ArgumentGroup(parser, "multiprocessing",
"start paddle training using multi-processing mode.")
multip_g.add_arg("node_ips", str, None,
"paddle trainer ips")
multip_g.add_arg("node_id", int, 0,
"the trainer id of the node for multi-node distributed training.")
multip_g.add_arg("print_config", bool, True,
"print the config of multi-processing mode.")
multip_g.add_arg("current_node_ip", str, None,
"the ip of current node.")
multip_g.add_arg("split_log_path", str, "./log",
"log path for each trainer.")
multip_g.add_arg("log_prefix", str, "",
"the prefix name of job log.")
multip_g.add_arg("nproc_per_node", int, 8,
"the number of process to use on each node.")
multip_g.add_arg("selected_gpus", str, "0,1,2,3,4,5,6,7",
"the gpus selected to use.")
multip_g.add_arg("training_script", str, None, "the program/script to be lauched "
"in parallel followed by all the arguments", positional_arg=True)
multip_g.add_arg("training_script_args", str, None,
"training script args", positional_arg=True, nargs=argparse.REMAINDER)
# yapf: enable
log = logging.getLogger()
def start_procs(args):
    procs = []
    log_fns = []

    default_env = os.environ.copy()

    node_id = args.node_id
    node_ips = [x.strip() for x in args.node_ips.split(',')]
    current_ip = args.current_node_ip
    if args.current_node_ip is None:
        assert len(node_ips) == 1
        current_ip = node_ips[0]
    log.info(current_ip)
    num_nodes = len(node_ips)
    selected_gpus = [x.strip() for x in args.selected_gpus.split(',')]
    selected_gpu_num = len(selected_gpus)

    all_trainer_endpoints = ""
    for ip in node_ips:
        for i in range(args.nproc_per_node):
            if all_trainer_endpoints != "":
                all_trainer_endpoints += ","
            all_trainer_endpoints += "%s:617%d" % (ip, i)

    nranks = num_nodes * args.nproc_per_node
    # ceiling division, with the remainder operands in the right order
    # (the original took args.nproc_per_node % selected_gpu_num)
    gpus_per_proc = selected_gpu_num // args.nproc_per_node
    if selected_gpu_num % args.nproc_per_node != 0:
        gpus_per_proc += 1
    log.info(gpus_per_proc)
    selected_gpus_per_proc = [
        selected_gpus[i:i + gpus_per_proc]
        for i in range(0, len(selected_gpus), gpus_per_proc)
    ]

    if args.print_config:
        log.info("all_trainer_endpoints: %s"
                 ", node_id: %s"
                 ", current_ip: %s"
                 ", num_nodes: %s"
                 ", node_ips: %s"
                 ", gpus_per_proc: %s"
                 ", selected_gpus_per_proc: %s"
                 ", nranks: %s" % (
                     all_trainer_endpoints,
                     node_id,
                     current_ip,
                     num_nodes,
                     node_ips,
                     gpus_per_proc,
                     selected_gpus_per_proc,
                     nranks))

    current_env = copy.copy(default_env)
    procs = []
    cmds = []
    log_fns = []
    for i in range(0, args.nproc_per_node):
        trainer_id = node_id * args.nproc_per_node + i
        current_env.update({
            "FLAGS_selected_gpus": "%s" % ",".join([str(s) for s in selected_gpus_per_proc[i]]),
            "PADDLE_TRAINER_ID": "%d" % trainer_id,
            "PADDLE_CURRENT_ENDPOINT": "%s:617%d" % (current_ip, i),
            "PADDLE_TRAINERS_NUM": "%d" % nranks,
            "PADDLE_TRAINER_ENDPOINTS": all_trainer_endpoints,
            "PADDLE_NODES_NUM": "%d" % num_nodes
        })

        try:
            idx = args.training_script_args.index('--is_distributed')
            args.training_script_args[idx + 1] = 'true'
        except ValueError:
            args.training_script_args += ['--is_distributed', 'true']

        cmd = [sys.executable, "-u",
               args.training_script] + args.training_script_args
        cmds.append(cmd)

        if args.split_log_path:
            logdir = "%s/%sjob.log.%d" % (args.split_log_path, args.log_prefix, trainer_id)
            try:
                os.mkdir(os.path.dirname(logdir))
            except OSError:
                pass
            fn = open(logdir, "a")
            log_fns.append(fn)
            process = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
            log.info('subprocess launched, check log at %s' % logdir)
        else:
            process = subprocess.Popen(cmd, env=current_env)
            log.info('subprocess launched')
        procs.append(process)

    try:
        for i in range(len(procs)):
            proc = procs[i]
            proc.wait()
            if len(log_fns) > 0:
                log_fns[i].close()
            if proc.returncode != 0:
                raise subprocess.CalledProcessError(returncode=procs[i].returncode,
                                                    cmd=cmds[i])
            else:
                log.info("proc %d finished" % i)
    except KeyboardInterrupt as e:
        for p in procs:
            log.info('killing %s' % p)
            p.terminate()
def main(args):
    if args.print_config:
        print_arguments(args)
    start_procs(args)
if __name__ == "__main__":
prepare_logger(log)
lanch_args = parser.parse_args()
pretraining_args = worker_parser.parse_args(
lanch_args.training_script_args)
init_path = pretraining_args.init_checkpoint
if init_path and not pretraining_args.use_fp16:
os.system('rename .master "" ' + init_path + '/*.master')
main(lanch_args)
@@ -11,9 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import numpy as np
@@ -36,8 +38,10 @@ class ErnieDataReader(object):
                 filelist,
                 vocab_path,
                 batch_size=4096,
                 in_tokens=True,
                 max_seq_len=512,
                 shuffle_files=True,
                 random_seed=1,
                 epoch=100,
                 voc_size=0,
                 is_test=False,
@@ -46,6 +50,8 @@ class ErnieDataReader(object):
        self.vocab = self.load_vocab(vocab_path)
        self.filelist = filelist
        self.batch_size = batch_size
        self.in_tokens = in_tokens
        self.random_seed = random_seed
        self.shuffle_files = shuffle_files
        self.epoch = epoch
        self.current_epoch = 0
@@ -60,13 +66,43 @@ class ErnieDataReader(object):
        self.mask_id = self.vocab["[MASK]"]
        self.is_test = is_test
        self.generate_neg_sample = generate_neg_sample

        self.trainer_id = 0
        self.trainer_nums = 1
        self.files = open(filelist).readlines()
        self.total_file = len(self.files)

        if self.is_test:
            self.epoch = 1
            self.shuffle_files = False

        self.global_rng = np.random.RandomState(random_seed)
        if self.shuffle_files:
            if os.getenv("PADDLE_TRAINER_ID"):
                self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
            if os.getenv("PADDLE_NODES_NUM"):
                self.trainer_nums = int(os.getenv("PADDLE_TRAINERS_NUM"))
            # renew total_file: trim the file count to a multiple of the trainer count
            self.total_file = len(self.files) // self.trainer_nums * self.trainer_nums
            if len(self.files) < self.trainer_nums:
                raise RuntimeError('not enough train files to shard, file:%d num_trainer:%d' %
                                   (len(self.files), self.trainer_nums))

            tmp_files = []
            for each in range(epoch):
                each_files = list(self.files)
                self.global_rng.shuffle(each_files)
                tmp_files += each_files
            self.files = tmp_files
            # renew epochs: one pass over total_file files per epoch
            self.epoch = len(self.files) // self.total_file

        assert self.total_file > 0, \
            "[Error] data_dir is empty or has fewer files than trainers (%d)" % self.trainer_nums

        if self.in_tokens:
            assert self.batch_size > 100, "Current batch size means total token's number, " \
                "it should not be set to too small number."
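The sharding logic above is easier to see as a toy example. The same arithmetic with assumed values (10 files, 4 trainers, 2 epochs; the per-epoch shuffle is skipped for clarity):

```python
# Toy re-run of the sharding arithmetic above; all values are assumptions.
files = ["part-%02d" % i for i in range(10)]
trainer_nums, epoch = 4, 2

total_file = len(files) // trainer_nums * trainer_nums  # 8: trim to multiple of 4
expanded = files * epoch                                # one copy per epoch
epochs = len(expanded) // total_file                    # 2 passes of 8 files each

# trainer k reads every trainer_nums-th file of its epoch slice:
trainer_id = 1
shard = [f for idx, f in enumerate(expanded[0:total_file])
         if idx % trainer_nums == trainer_id]
print(epochs, shard)  # 2 ['part-01', 'part-05']
```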
    def get_progress(self):
        """return current progress of training data
        """
@@ -75,13 +111,16 @@ class ErnieDataReader(object):
    def parse_line(self, line, max_seq_len=512):
        """ parse one line to token_ids, sentence_ids, pos_ids, label
        """
        line = line.strip().split(";")
        assert len(line) == 5, \
            "One sample must have %d fields!" % 5

        (token_ids, sent_ids, pos_ids, seg_labels, label) = line
        token_ids = [int(token) for token in token_ids.split(" ")]
        sent_ids = [int(token) for token in sent_ids.split(" ")]
        pos_ids = [int(token) for token in pos_ids.split(" ")]
        seg_labels = [int(seg_label) for seg_label in seg_labels.split(" ")]
        assert len(token_ids) == len(sent_ids) == len(pos_ids) == len(
            seg_labels
        ), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids) == len(seg_labels)"
@@ -94,6 +133,7 @@ class ErnieDataReader(object):
        assert file.endswith('.gz'), "[ERROR] %s is not a gzip file" % file
        with gzip.open(file, "rb") as f:
            for line in f:
                line = line.decode('utf8')
                parsed_line = self.parse_line(
                    line, max_seq_len=self.max_seq_len)
                if parsed_line is None:
@@ -233,34 +273,62 @@ class ErnieDataReader(object):
                   (num_total_miss, pos_sample_num * 2,
                    num_total_miss / (pos_sample_num * 2)))
    def shuffle_samples(self, sample_generator, buffer=1000):
        samples = []
        try:
            while True:
                while len(samples) < buffer:
                    sample = next(sample_generator)
                    samples.append(sample)
                np.random.shuffle(samples)
                for sample in samples:
                    yield sample
                samples = []
        except StopIteration:
            print("stopiteration: reach end of file")
            if len(samples) == 0:
                yield None
            else:
                np.random.shuffle(samples)
                for sample in samples:
                    yield sample
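`shuffle_samples` is a bounded-buffer shuffle: it holds at most `buffer` samples in memory and only mixes samples that arrive within the same window, so some of the upstream order survives. A standalone toy version of the same idea (assumed, simplified; it omits the reader's trailing `yield None`):

```python
import numpy as np

def buffered_shuffle(it, buffer=4):
    """Shuffle an iterator using at most `buffer` items of memory."""
    buf = []
    for x in it:
        buf.append(x)
        if len(buf) == buffer:
            np.random.shuffle(buf)
            for y in buf:
                yield y
            buf = []
    np.random.shuffle(buf)     # flush the partial final window
    for y in buf:
        yield y

print(list(buffered_shuffle(iter(range(10)))))  # e.g. [2, 0, 3, 1, 6, 5, 4, 7, 9, 8]
```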
    def data_generator(self):
        """
        data_generator
        """
        def wrapper():
            def reader():
                for epoch in range(self.epoch):
                    self.current_epoch = epoch + 1
                    files = self.files
                    # during training, data are sliced by trainers
                    if self.shuffle_files:
                        start = epoch * self.total_file
                        end = start + self.total_file
                        files = [file_ for index, file_ in enumerate(self.files[start:end])
                                 if index % self.trainer_nums == self.trainer_id]

                    for index, file_ in enumerate(files):
                        file_, mask_word_prob = file_.strip().split("\t")
                        mask_word = (np.random.random() < float(mask_word_prob))

                        self.current_file_index = (index + 1) * self.trainer_nums
                        self.current_file = file_
                        if mask_word:
                            self.mask_type = "mask_word"
                        else:
                            self.mask_type = "mask_char"

                        sample_generator = self.read_file(file_)
                        if not self.is_test:
                            if self.generate_neg_sample:
                                sample_generator = self.mixin_negtive_samples(
                                    sample_generator)
                            else:
                                # shuffle buffered samples
                                sample_generator = self.shuffle_samples(
                                    sample_generator)

                        for sample in sample_generator:
                            if sample is None:
                                continue
@@ -272,7 +340,11 @@ class ErnieDataReader(object):
            for parsed_line in reader():
                token_ids, sent_ids, pos_ids, label, seg_labels, mask_word = parsed_line
                max_len = max(max_len, len(token_ids))
                if self.in_tokens:
                    to_append = (len(batch) + 1) * max_len <= batch_size
                else:
                    to_append = len(batch) < batch_size
                if to_append:
                    batch.append(parsed_line)
                    total_token_num += len(token_ids)
                else:
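The `in_tokens` branch packs examples until `(len(batch) + 1) * max_len` would exceed the budget, i.e. `batch_size` counts padded tokens, not examples. A small worked example with assumed sequence lengths:

```python
# Assumed sequence lengths and a 512-token budget (illustrative only).
lengths = [60, 200, 180, 500]
batch_size, batch, max_len = 512, [], 0

for n in lengths:
    max_len = max(max_len, n)
    if (len(batch) + 1) * max_len <= batch_size:  # padded size if we append
        batch.append(n)
    else:
        print("emit batch:", batch)               # emit and start a new batch
        batch, max_len = [n], n
print("emit batch:", batch)
# emit batch: [60, 200] / emit batch: [180] / emit batch: [500]
```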
@@ -11,18 +11,41 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import sys
import os
import json
import random
import logging
import numpy as np
import six
from io import open
from collections import namedtuple

import tokenization
from batching import pad_batch_data

log = logging.getLogger(__name__)

if six.PY3:
    import io
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')

def csv_reader(fd, delimiter='\t'):
    def gen():
        for i in fd:
            yield i.rstrip('\n').split(delimiter)
    return gen()
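`csv_reader` replaces the former `csv.reader` calls with a plain split on the delimiter, which sidesteps the `csv` module's Python 2 byte-string handling and field-size limits on long text columns (this motivation is an inference from the change, not stated in the diff). Usage matches `csv.reader`; `sample.tsv` below is a hypothetical two-column file:

```python
# Drop-in usage of csv_reader defined above.
with open('sample.tsv', 'r', encoding='utf8') as f:
    reader = csv_reader(f)
    headers = next(reader)        # first row, e.g. ['text_a', 'label']
    for fields in reader:         # each row already split on '\t'
        print(dict(zip(headers, fields)))
```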
class BaseReader(object):
    def __init__(self,
                 vocab_path,
@@ -58,7 +81,7 @@ class BaseReader(object):
        self.num_examples = 0

        if label_map_config:
            with open(label_map_config, encoding='utf8') as f:
                self.label_map = json.load(f)
        else:
            self.label_map = None
@@ -69,8 +92,8 @@ class BaseReader(object):
    def _read_tsv(self, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, 'r', encoding='utf8') as f:
            reader = csv_reader(f)
            headers = next(reader)
            Example = namedtuple('Example', headers)
@@ -242,15 +265,21 @@ class BaseReader(object):
                    for batch in all_dev_batches:
                        yield batch
                    all_dev_batches = []

        def f():
            try:
                for i in wrapper():
                    yield i
            except Exception as e:
                import traceback
                traceback.print_exc()

        return f

class ClassifyReader(BaseReader):
    def _read_tsv(self, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, 'r', encoding='utf8') as f:
            reader = csv_reader(f)
            headers = next(reader)
            text_indices = [
                index for index, h in enumerate(headers) if h != "label"
@@ -472,7 +501,7 @@ class MRCReader(BaseReader):
    def _read_json(self, input_file, is_training):
        examples = []
        with open(input_file, "r", encoding='utf8') as f:
            input_data = json.load(f)["data"]
        for entry in input_data:
            for paragraph in entry["paragraphs"]:
@@ -507,7 +536,7 @@ class MRCReader(BaseReader):
                        actual_text = " ".join(doc_tokens[start_pos:(end_pos
                                                                     + 1)])
                        if actual_text.find(orig_answer_text) == -1:
                            log.info("Could not find answer: '%s' vs. '%s'",
                                     actual_text, orig_answer_text)
                            continue
                    else:
@@ -16,9 +16,12 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import time
import logging
import multiprocessing

# NOTE(paddle-dev): All of these flags should be
@@ -32,12 +35,13 @@ import reader.task_reader as task_reader
from model.ernie import ErnieConfig
from finetune.classifier import create_model, evaluate, predict
from optimization import optimization
from utils.args import print_arguments, check_cuda, prepare_logger
from utils.init import init_pretraining_params, init_checkpoint
from utils.cards import get_cards
from finetune_args import parser

args = parser.parse_args()
log = logging.getLogger()
def main(args):
@@ -45,8 +49,9 @@ def main(args):
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
@@ -75,8 +80,6 @@ def main(args):
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
num_train_examples = reader.get_num_examples(args.train_set) num_train_examples = reader.get_num_examples(args.train_set)
if args.in_tokens: if args.in_tokens:
if args.batch_size < args.max_seq_len:
raise ValueError('if in_tokens=True, batch_size should greater than max_sqelen, got batch_size:%d seqlen:%d' % (args.batch_size, args.max_seq_len))
max_train_steps = args.epoch * num_train_examples // ( max_train_steps = args.epoch * num_train_examples // (
args.batch_size // args.max_seq_len) // dev_count args.batch_size // args.max_seq_len) // dev_count
else: else:
max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count
warmup_steps = int(max_train_steps * args.warmup_proportion) warmup_steps = int(max_train_steps * args.warmup_proportion)
print("Device count: %d" % dev_count) log.info("Device count: %d" % dev_count)
print("Num train examples: %d" % num_train_examples) log.info("Num train examples: %d" % num_train_examples)
print("Max train steps: %d" % max_train_steps) log.info("Max train steps: %d" % max_train_steps)
print("Num warmup steps: %d" % warmup_steps) log.info("Num warmup steps: %d" % warmup_steps)
        train_program = fluid.Program()
        if args.random_seed is not None and args.enable_ce:
@@ -121,7 +126,13 @@ def main(args):
            startup_prog=startup_prog,
            weight_decay=args.weight_decay,
            scheduler=args.lr_scheduler,
            use_fp16=args.use_fp16,
            use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
            init_loss_scaling=args.init_loss_scaling,
            incr_every_n_steps=args.incr_every_n_steps,
            decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
            incr_ratio=args.incr_ratio,
            decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
@@ -131,7 +142,7 @@ def main(args):
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
@@ -148,11 +159,36 @@ def main(args):
        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}".format(
            worker_endpoints, trainers_num, current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            trainers=worker_endpoints_env,
            current_endpoint=current_endpoint,
            program=train_program if args.do_train else test_prog,
            startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id
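This block is the consumer side of the launcher scripts earlier in this change: each worker reads back the `PADDLE_*` variables the launcher exported and hands them to the nccl2 transpiler. Continuing the hypothetical single-node launcher example:

```python
# What worker 2 of the 4-worker launcher sketch would observe (assumed values):
#   PADDLE_TRAINER_ID        = "2"
#   PADDLE_CURRENT_ENDPOINT  = "127.0.0.1:6172"
#   PADDLE_TRAINER_ENDPOINTS = "127.0.0.1:6170,127.0.0.1:6171,127.0.0.1:6172,127.0.0.1:6173"
# so trainers_num == 4 and this process transpiles as trainer 2.
```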
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.warning(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
@@ -192,7 +228,7 @@ def main(args):
            num_trainers=nccl2_num_trainers,
            trainer_id=nccl2_trainer_id)

        train_pyreader.set_batch_generator(train_data_generator)
    else:
        train_exe = None
@@ -236,14 +272,14 @@ def main(args):
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                    log.info(verbose)

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin

                    if args.is_classify:
                        log.info(
                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                            "ave acc: %f, speed: %f steps/s" %
                            (current_epoch, current_example, num_train_examples,
@@ -252,7 +288,7 @@ def main(args):
                        ce_info.append(
                            [outputs["loss"], outputs["accuracy"], used_time])
                    if args.is_regression:
                        log.info(
                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                            " speed: %f steps/s" %
                            (current_epoch, current_example, num_train_examples,
@@ -260,6 +296,7 @@ def main(args):
                             args.skip_steps / used_time))
                    time_begin = time.time()

                if nccl2_trainer_id == 0:
                    if steps % args.save_steps == 0:
                        save_path = os.path.join(args.checkpoints,
                                                 "step_" + str(steps))
@@ -295,10 +332,10 @@ def main(args):
            ce_acc = ce_info[-2][1]
            ce_time = ce_info[-2][2]
        except:
            log.info("ce info error")
        log.info("kpis\ttrain_duration_card%s\t%s" % (card_num, ce_time))
        log.info("kpis\ttrain_loss_card%s\t%f" % (card_num, ce_loss))
        log.info("kpis\ttrain_acc_card%s\t%f" % (card_num, ce_acc))
    # final eval on dev set
    if args.do_val:
@@ -312,7 +349,7 @@ def main(args):

    # final eval on diagnostic, hack for glue-ax
    if args.diagnostic:
        test_pyreader.set_batch_generator(
            reader.data_generator(
                args.diagnostic,
                batch_size=args.batch_size,
@@ -320,7 +357,7 @@ def main(args):
                dev_count=1,
                shuffle=False))

        log.info("Final diagnostic")
        qids, preds, probs = predict(
            test_exe,
            test_prog,
@@ -334,22 +371,23 @@ def main(args):
            for id, s, p in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(id, s, p))

        log.info("Done final diagnostic, saving to {}".format(
            args.diagnostic_save))

def evaluate_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars,
                     epoch, steps):
    # evaluate dev set
    batch_size = args.batch_size if args.predict_batch_size is None else args.predict_batch_size
    for ds in args.dev_set.split(','):
        test_pyreader.set_batch_generator(
            reader.data_generator(
                ds,
                batch_size=batch_size,
                epoch=1,
                dev_count=1,
                shuffle=False))
        log.info("validation result of dataset {}:".format(ds))
        evaluate_info = evaluate(
            exe,
            test_prog,
@@ -359,7 +397,7 @@ def evaluate_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars,
            metric=args.metric,
            is_classify=args.is_classify,
            is_regression=args.is_regression)
        log.info(evaluate_info + ', file: {}, epoch: {}, steps: {}'.format(
            ds, epoch, steps))
@@ -368,18 +406,19 @@ def predict_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars,
    test_sets = args.test_set.split(',')
    save_dirs = args.test_save.split(',')
    assert len(test_sets) == len(save_dirs)
    batch_size = args.batch_size if args.predict_batch_size is None else args.predict_batch_size

    for test_f, save_f in zip(test_sets, save_dirs):
        test_pyreader.set_batch_generator(
            reader.data_generator(
                test_f,
                batch_size=batch_size,
                epoch=1,
                dev_count=1,
                shuffle=False))

        save_path = save_f + '.' + str(epoch) + '.' + str(steps)
        log.info("testing {}, save to {}".format(test_f, save_path))
        qids, preds, probs = predict(
            exe,
            test_prog,
@@ -393,11 +432,16 @@ def predict_wrapper(args, reader, exe, test_prog, test_pyreader, graph_vars,
            os.makedirs(save_dir)

        with open(save_path, 'w') as f:
            if len(qids) == 0:
                for s, p in zip(preds, probs):
                    f.write('{}\t{}\n'.format(s, p))
            else:
                for id, s, p in zip(qids, preds, probs):
                    f.write('{}\t{}\t{}\n'.format(id, s, p))

if __name__ == '__main__':
    prepare_logger(log)
    print_arguments(args)
    check_cuda(args.use_cuda)
    main(args)
@@ -16,9 +16,11 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import time
import logging
import multiprocessing

# NOTE(paddle-dev): All of these flags should be
@@ -32,11 +34,12 @@ import reader.task_reader as task_reader
from model.ernie import ErnieConfig
from finetune.mrc import create_model, evaluate
from optimization import optimization
from utils.args import print_arguments, prepare_logger
from utils.init import init_pretraining_params, init_checkpoint
from finetune_args import parser

args = parser.parse_args()
log = logging.getLogger()
def main(args): def main(args):
...@@ -44,8 +47,9 @@ def main(args): ...@@ -44,8 +47,9 @@ def main(args):
ernie_config.print_config() ernie_config.print_config()
if args.use_cuda: if args.use_cuda:
place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0'))) dev_list = fluid.cuda_places()
dev_count = fluid.core.get_cuda_device_count() place = dev_list[0]
dev_count = len(dev_list)
else: else:
place = fluid.CPUPlace() place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
@@ -70,6 +74,8 @@ def main(args):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    if args.do_test:
        assert args.test_save is not None
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed
@@ -77,27 +83,30 @@ def main(args):
    if args.predict_batch_size is None:
        args.predict_batch_size = args.batch_size
    if args.do_train:
        trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=trainers_num,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples("train")

        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should be greater than max_seq_len, '
                    'got batch_size:%d seqlen:%d' % (args.batch_size, args.max_seq_len))
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)
        train_program = fluid.Program()
@@ -108,7 +117,7 @@ def main(args):
                pyreader_name='train_reader',
                ernie_config=ernie_config,
                is_training=True)
            scheduled_lr, _ = optimization(
                loss=graph_vars["loss"],
                warmup_steps=warmup_steps,
                num_train_steps=max_train_steps,
@@ -117,7 +126,13 @@ def main(args):
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler=args.lr_scheduler,
                use_fp16=args.use_fp16,
                use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                init_loss_scaling=args.init_loss_scaling,
                incr_every_n_steps=args.incr_every_n_steps,
                decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                incr_ratio=args.incr_ratio,
                decr_ratio=args.decr_ratio)

    if args.verbose:
        if args.in_tokens:
@@ -127,7 +142,7 @@ def main(args):
        else:
            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                program=train_program, batch_size=args.batch_size)
        log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                 (lower_mem, upper_mem, unit))
    if args.do_val or args.do_test:
@@ -144,11 +159,36 @@ def main(args):

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} "
                 "trainer_id:{}".format(worker_endpoints, trainers_num,
                                        current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            trainers=worker_endpoints_env,
            current_endpoint=current_endpoint,
            program=train_program if args.do_train else test_prog,
            startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.warning(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
@@ -188,7 +228,7 @@ def main(args):
            num_trainers=nccl2_num_trainers,
            trainer_id=nccl2_trainer_id)

        train_pyreader.set_batch_generator(train_data_generator)
    else:
        train_exe = None
@@ -214,12 +254,12 @@ def main(args):
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        log.info(verbose)

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    log.info("epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                             "speed: %f steps/s" %
                             (current_epoch, current_example, num_train_examples,
                              steps, outputs["loss"], args.skip_steps / used_time))
@@ -232,7 +272,7 @@ def main(args):
                if steps % args.validation_steps == 0:
                    if args.do_val:
                        test_pyreader.set_batch_generator(
                            reader.data_generator(
                                args.dev_set,
                                batch_size=args.batch_size,
@@ -251,7 +291,7 @@ def main(args):
                            args=args)
                    if args.do_test:
                        test_pyreader.set_batch_generator(
                            reader.data_generator(
                                args.test_set,
                                batch_size=args.batch_size,
@@ -277,8 +317,8 @@ def main(args):

    # final eval on dev set
    if args.do_val:
        log.info("Final validation result:")
        test_pyreader.set_batch_generator(
            reader.data_generator(
                args.dev_set,
                batch_size=args.batch_size,
@@ -298,8 +338,8 @@ def main(args):

    # final eval on test set
    if args.do_test:
        log.info("Final test result:")
        test_pyreader.set_batch_generator(
            reader.data_generator(
                args.test_set,
                batch_size=args.batch_size,
@@ -319,7 +359,8 @@ def main(args):

if __name__ == '__main__':
    prepare_logger(log)
    print_arguments(args)
    scope = fluid.core.Scope()
    with fluid.scope_guard(scope):
        main(args)
@@ -16,10 +16,15 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import time
import six
import logging
import multiprocessing
from io import open

# NOTE(paddle-dev): All of these flags should be
# set before `import paddle`. Otherwise, it would
@@ -32,11 +37,12 @@ import reader.task_reader as task_reader
from model.ernie import ErnieConfig
from optimization import optimization
from utils.init import init_pretraining_params, init_checkpoint
from utils.args import print_arguments, check_cuda, prepare_logger
from finetune.sequence_label import create_model, evaluate, predict, calculate_f1
from finetune_args import parser

args = parser.parse_args()
log = logging.getLogger()
def main(args):
@@ -44,12 +50,12 @@ def main(args):
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    reader = task_reader.SequenceLabelReader(
        vocab_path=args.vocab_path,
@@ -79,16 +85,19 @@ def main(args):
        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should be greater than max_seq_len, '
                    'got batch_size:%d seqlen:%d' % (args.batch_size, args.max_seq_len))
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)
        train_program = fluid.Program()
@@ -107,7 +116,13 @@ def main(args):
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler=args.lr_scheduler,
                use_fp16=args.use_fp16,
                use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                init_loss_scaling=args.init_loss_scaling,
                incr_every_n_steps=args.incr_every_n_steps,
                decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                incr_ratio=args.incr_ratio,
                decr_ratio=args.decr_ratio)

    if args.verbose:
        if args.in_tokens:
@@ -117,7 +132,7 @@ def main(args):
        else:
            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                program=train_program, batch_size=args.batch_size)
        log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                 (lower_mem, upper_mem, unit))
    if args.do_val or args.do_test:
@@ -131,11 +146,38 @@ def main(args):
        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} "
                 "trainer_id:{}".format(worker_endpoints, trainers_num,
                                        current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(
            trainer_id,
            trainers=worker_endpoints_env,
            current_endpoint=current_endpoint,
            program=train_program if args.do_train else test_prog,
            startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.info(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
@@ -171,9 +213,11 @@ def main(args):
            use_cuda=args.use_cuda,
            loss_name=graph_vars["loss"].name,
            exec_strategy=exec_strategy,
            main_program=train_program,
            num_trainers=nccl2_num_trainers,
            trainer_id=nccl2_trainer_id)

        train_pyreader.set_batch_generator(train_data_generator)
    else:
        train_exe = None
@@ -186,7 +230,6 @@ def main(args):

    if args.do_train:
        train_pyreader.start()
        steps = 0
        graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()
@@ -196,54 +239,47 @@ def main(args):
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    fetch_list = [
                        graph_vars["num_infer"].name,
                        graph_vars["num_label"].name,
                        graph_vars["num_correct"].name,
                        graph_vars["loss"].name,
                        graph_vars["learning_rate"].name,
                    ]
                    out = train_exe.run(fetch_list=fetch_list)
                    num_infer, num_label, num_correct, np_loss, np_lr = out
                    lr = float(np_lr[0])
                    loss = np_loss.mean()
                    precision, recall, f1 = calculate_f1(num_label, num_infer,
                                                         num_correct)
                    if args.verbose:
                        log.info("train pyreader queue size: %d, learning rate: %f"
                                 % (train_pyreader.queue.size(),
                                    lr if warmup_steps > 0 else args.learning_rate))

                    current_example, current_epoch = reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    log.info("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                             "f1: %f, precision: %f, recall: %f, speed: %f steps/s"
                             % (current_epoch, current_example, num_train_examples,
                                steps, loss, f1, precision, recall,
                                args.skip_steps / used_time))
                    time_begin = time.time()

                if nccl2_trainer_id == 0 and steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if nccl2_trainer_id == 0 and steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        evaluate_wrapper(reader, exe, test_prog, test_pyreader,
                                         graph_vars, current_epoch, steps)
                    # evaluate test set
                    if args.do_test:
                        predict_wrapper(reader, exe, test_prog, test_pyreader,
                                        graph_vars, current_epoch, steps)

        except fluid.core.EOFException:
            save_path = os.path.join(args.checkpoints, "step_" + str(steps))
@@ -252,31 +288,71 @@ def main(args):
            break

    # final eval on dev set
    if nccl2_trainer_id == 0 and args.do_val:
        if not args.do_train:
            current_example, current_epoch = reader.get_train_progress()
        evaluate_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
                         current_epoch, 'final')

    if nccl2_trainer_id == 0 and args.do_test:
        if not args.do_train:
            current_example, current_epoch = reader.get_train_progress()
        predict_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
                        current_epoch, 'final')


def evaluate_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
                     epoch, steps):
    # evaluate dev set
    batch_size = args.batch_size if args.predict_batch_size is None else args.predict_batch_size
    for ds in args.dev_set.split(','):  # single-card eval
        test_pyreader.set_batch_generator(
            reader.data_generator(
                ds,
                batch_size=batch_size,
                epoch=1,
                dev_count=1,
                shuffle=False))
        log.info("validation result of dataset {}:".format(ds))
        info = evaluate(exe, test_prog, test_pyreader, graph_vars,
                        args.num_labels)
        log.info(info + ', file: {}, epoch: {}, steps: {}'.format(
            ds, epoch, steps))


def predict_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
                    epoch, steps):
    test_sets = args.test_set.split(',')
    save_dirs = args.test_save.split(',')
    assert len(test_sets) == len(save_dirs), \
        'number of test_sets & test_save not match, got %d vs %d' % (
            len(test_sets), len(save_dirs))
    batch_size = args.batch_size if args.predict_batch_size is None else args.predict_batch_size
    for test_f, save_f in zip(test_sets, save_dirs):
        test_pyreader.set_batch_generator(
            reader.data_generator(
                test_f,
                batch_size=batch_size,
                epoch=1,
                dev_count=1,
                shuffle=False))

        save_path = save_f + '.' + str(epoch) + '.' + str(steps)
        log.info("testing {}, save to {}".format(test_f, save_path))
        res = predict(exe, test_prog, test_pyreader, graph_vars, dev_count=1)
        save_dir = os.path.dirname(save_path)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        tokenizer = reader.tokenizer
        rev_label_map = {v: k for k, v in six.iteritems(reader.label_map)}
        with open(save_path, 'w', encoding='utf8') as f:
            for id, s, p in res:
                id = ' '.join(tokenizer.convert_ids_to_tokens(id))
                p = ' '.join(['%.5f' % pp[ss] for ss, pp in zip(s, p)])
                s = ' '.join([rev_label_map[ss] for ss in s])
                f.write('{}\t{}\t{}\n'.format(id, s, p))


if __name__ == '__main__':
    prepare_logger(log)
    print_arguments(args)
    check_cuda(args.use_cuda)
    main(args)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import sys
import os
import argparse
from propeller.service.client import InferenceClient
from propeller import log
import six
import utils.data
from time import time
import numpy as np
class ErnieClient(InferenceClient):
def __init__(self,
vocab_file,
host='localhost',
port=8888,
batch_size=32,
num_coroutine=1,
timeout=10.,
max_seqlen=128):
host_port = 'tcp://%s:%d' % (host, port)
        # super().__init__ returns None; the dead `client = ` assignment is dropped
        super(ErnieClient, self).__init__(host_port, batch_size=batch_size,
                                          num_coroutine=num_coroutine,
                                          timeout=timeout)
self.vocab = {j.strip().split(b'\t')[0].decode('utf8'): i for i, j in enumerate(open(vocab_file, 'rb'))}
self.tokenizer = utils.data.CharTokenizer(self.vocab.keys())
self.max_seqlen = max_seqlen
self.cls_id = self.vocab['[CLS]']
self.sep_id = self.vocab['[SEP]']
def txt_2_id(self, text):
ids = np.array([self.vocab[i] for i in self.tokenizer(text)])
return ids
def pad_and_batch(self, ids):
max_len = max(map(len, ids))
        padded = np.stack([np.pad(i, [[0, max_len - len(i)]], mode='constant') for i in ids])
padded = np.expand_dims(padded, axis=-1)
return padded
def __call__(self, text_a, text_b=None):
        if text_b is not None and len(text_a) != len(text_b):
            raise ValueError('text_b has a different size (%d) than text_a (%d)'
                             % (len(text_b), len(text_a)))
text_a = [i.encode('utf8') if isinstance(i, six.string_types) else i for i in text_a]
if text_b is not None:
text_b = [i.encode('utf8') if isinstance(i, six.string_types) else i for i in text_b]
ids_a = map(self.txt_2_id, text_a)
if text_b is not None:
ids_b = map(self.txt_2_id, text_b)
ret = [utils.data.build_2_pair(a, b, self.max_seqlen, self.cls_id, self.sep_id) for a, b in zip(ids_a, ids_b)]
else:
ret = [utils.data.build_1_pair(a, self.max_seqlen, self.cls_id, self.sep_id) for a in ids_a]
sen_ids, token_type_ids = zip(*ret)
sen_ids = self.pad_and_batch(sen_ids)
token_type_ids = self.pad_and_batch(token_type_ids)
ret, = super(ErnieClient, self).__call__(sen_ids, token_type_ids)
return ret
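
# Usage sketch (editor's addition): assuming a propeller InferenceServer is
# already serving an ERNIE model on localhost:8888 and `vocab.txt` matches
# that model, the client defined above can be called on raw text batches:
#
#   client = ErnieClient('vocab.txt', host='localhost', port=8888,
#                        batch_size=32, num_coroutine=1)
#   single = client([u'ernie预训练模型'])       # single-sentence batch
#   paired = client([u'语句一'], [u'语句二'])   # text_a / text_b pairs
#
# The return value is whatever the served model emits, e.g. pooled [CLS]
# features of shape (batch, hidden) when the 'pooler' layer is exported.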
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ernie_encoder_client')
parser.add_argument('--host', type=str, default='localhost')
parser.add_argument('-i', '--input', type=str, required=True)
parser.add_argument('-o', '--output', type=str, required=True)
parser.add_argument('-p', '--port', type=int, default=8888)
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--num_coroutine', type=int, default=1)
parser.add_argument('--vocab', type=str, required=True)
args = parser.parse_args()
client = ErnieClient(args.vocab, args.host, args.port, batch_size=args.batch_size, num_coroutine=args.num_coroutine)
inputs = [i.strip().split(b'\t') for i in open(args.input, 'rb').readlines()]
if len(inputs) == 0:
raise ValueError('empty input')
send_batch = args.num_coroutine * args.batch_size
send_num = len(inputs) // send_batch + 1
rets = []
start = time()
for i in range(send_num):
slice = inputs[i * send_batch: (i + 1) * send_batch]
if len(slice) == 0:
continue
columns = list(zip(*slice))
if len(columns) > 2:
raise ValueError('inputs file has more than 2 columns')
ret = client(*columns)
if len(ret.shape) == 3:
ret = ret[:, 0, :] # take cls
rets.append(ret)
end = time()
with open(args.output, 'wb') as outf:
arr = np.concatenate(rets, 0)
np.save(outf, arr)
log.info('query num: %d average latency %.5f' % (len(inputs), (end - start)/len(inputs)))
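
# Example invocation (editor's note; the script and file names are
# illustrative, the flags come from the argparse setup above):
#
#   python ernie_client.py --vocab vocab.txt --host localhost -p 8888 \
#       --batch_size 32 --num_coroutine 4 -i input.tsv -o output.npy
#
# input.tsv holds one or two tab-separated text columns per line; output.npy
# receives the concatenated feature matrix written via np.save.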
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import sys
import os
import argparse
import logging
import logging.handlers
import re
from propeller.service.server import InferenceServer
from propeller import log
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-m', '--model_dir', type=str, required=True)
parser.add_argument('-p', '--port', type=int, default=8888)
parser.add_argument('-v', '--verbose', action='store_true')
parser.add_argument('--encode_layer', type=str, choices=[
'pooler',
'layer12',
'layer11',
'layer10',
'layer9',
'layer8',
'layer7',
'layer6',
'layer5',
'layer4',
'layer3',
'layer2',
'layer1',
], default='pooler')
args = parser.parse_args()
if args.verbose:
log.setLevel(logging.DEBUG)
cuda_env = os.getenv("CUDA_VISIBLE_DEVICES")
if cuda_env is None:
raise RuntimeError('CUDA_VISIBLE_DEVICES not set')
if not os.path.exists(args.model_dir):
raise ValueError('model_dir not found: %s' % args.model_dir)
n_devices = len(cuda_env.split(","))
if args.encode_layer.lower() == 'pooler':
model_dir = os.path.join(args.model_dir, 'pooler')
else:
pat = re.compile(r'layer(\d+)')
match = pat.match(args.encode_layer.lower())
layer = int(match.group(1))
model_dir = os.path.join(args.model_dir, 'enc%d' % layer)
server = InferenceServer(model_dir, n_devices)
    log.info('propeller server listening on port %d' % args.port)
server.listen(args.port)
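
# Example invocation (editor's note; the script name is illustrative):
#
#   CUDA_VISIBLE_DEVICES=0,1 python ernie_server.py -m ./inference_models \
#       -p 8888 --encode_layer pooler
#
# One model replica is created per visible GPU (n_devices above), and
# `--encode_layer layerN` expects a matching `encN` sub-directory under the
# model dir, while 'pooler' expects a `pooler` sub-directory.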
@@ -17,6 +17,10 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from io import open

import collections
import unicodedata
@@ -69,7 +73,7 @@ def printable_text(text):

def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, encoding='utf8') as fin:
        for num, line in enumerate(fin):
            items = convert_to_unicode(line.strip()).split("\t")
            if len(items) > 2:
......
@@ -12,14 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""ERNIE pretraining."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import time
import multiprocessing
import logging

import numpy as np
import paddle.fluid as fluid
@@ -27,31 +29,29 @@ import paddle.fluid as fluid
from reader.pretraining import ErnieDataReader
from model.ernie_v1 import ErnieModel, ErnieConfig
from optimization import optimization
from utils.args import print_arguments, check_cuda, prepare_logger
from utils.init import init_checkpoint, init_pretraining_params
from pretrain_args import parser

log = logging.getLogger()
args = parser.parse_args()

# yapf: enable.
def create_model(pyreader_name, ernie_config):
    src_ids = fluid.layers.data(name='1', shape=[-1, args.max_seq_len, 1], dtype='int64')
    pos_ids = fluid.layers.data(name='2', shape=[-1, args.max_seq_len, 1], dtype='int64')
    sent_ids = fluid.layers.data(name='3', shape=[-1, args.max_seq_len, 1], dtype='int64')
    input_mask = fluid.layers.data(name='4', shape=[-1, args.max_seq_len, 1], dtype='float32')
    mask_label = fluid.layers.data(name='5', shape=[-1, 1], dtype='int64')
    mask_pos = fluid.layers.data(name='6', shape=[-1, 1], dtype='int64')
    labels = fluid.layers.data(name='r', shape=[-1, 1], dtype='int64')

    pyreader = fluid.io.DataLoader.from_generator(feed_list=[
        src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels
    ], capacity=70, iterable=False)

    ernie = ErnieModel(
        src_ids=src_ids,
@@ -65,9 +65,6 @@ def create_model(pyreader_name, ernie_config):
    next_sent_acc, mask_lm_loss, total_loss = ernie.get_pretraining_output(
        mask_label, mask_pos, labels)

    return pyreader, next_sent_acc, mask_lm_loss, total_loss
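
# Editor's note: the change above swaps the old fluid.layers.py_reader for
# the fluid.io.DataLoader API. A minimal sketch of how the returned loader
# is driven (names follow this file; the surrounding training logic elided):
#
#   loader, acc, lm_loss, loss = create_model('train_reader', ernie_config)
#   loader.set_batch_generator(data_reader.data_generator())
#   loader.start()
#   try:
#       while True:
#           exe.run(fetch_list=[loss.name])
#   except fluid.core.EOFException:
#       loader.reset()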
@@ -97,7 +94,7 @@ def predict_wrapper(args,

    def predict(exe=exe, pyreader=pyreader):
        pyreader.set_batch_generator(data_reader.data_generator())
        pyreader.start()

        cost = 0
@@ -114,7 +111,7 @@ def predict_wrapper(args,
            cost += each_total_cost
            steps += 1
            if args.do_test and steps % args.skip_steps == 0:
                log.info("[test_set] steps: %d" % steps)

        except fluid.core.EOFException:
            pyreader.reset()
@@ -151,9 +148,9 @@ def test(args):
        pyreader=test_pyreader,
        fetch_list=[next_sent_acc.name, mask_lm_loss.name, total_loss.name])

    log.info("test begin")
    loss, lm_loss, acc, steps, speed = predict()
    log.info(
        "[test_set] loss: %f, global ppl: %f, next_sent_acc: %f, speed: %f steps/s"
        % (np.mean(np.array(loss) / steps),
           np.exp(np.mean(np.array(lm_loss) / steps)),
@@ -161,7 +158,7 @@ def test(args):

def train(args):
    log.info("pretraining start")
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()
@@ -171,7 +168,7 @@ def train(args):
    with fluid.unique_name.guard():
        train_pyreader, next_sent_acc, mask_lm_loss, total_loss = create_model(
            pyreader_name='train_reader', ernie_config=ernie_config)
        scheduled_lr, _ = optimization(
            loss=total_loss,
            warmup_steps=args.warmup_steps,
            num_train_steps=args.num_train_steps,
@@ -180,13 +177,14 @@ def train(args):
            startup_prog=startup_prog,
            weight_decay=args.weight_decay,
            scheduler=args.lr_scheduler,
            use_fp16=args.use_fp16,
            use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
            init_loss_scaling=args.init_loss_scaling,
            incr_every_n_steps=args.incr_every_n_steps,
            decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
            incr_ratio=args.incr_ratio,
            decr_ratio=args.decr_ratio)

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
@@ -196,31 +194,34 @@ def train(args):
    test_prog = test_prog.clone(for_test=True)

    if args.use_cuda:
        if len(fluid.cuda_places()) == 0:
            raise RuntimeError('no CUDA device found, check your env setting')
        place = fluid.cuda_places()[0]
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    log.info("Device count %d" % dev_count)
    log.info("theoretical memory usage: ")
    log.info(fluid.contrib.memory_usage(
        program=train_program, batch_size=args.batch_size // args.max_seq_len))

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    log.info("args.is_distributed: %s" % args.is_distributed)
    if args.is_distributed:
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        trainer_id = worker_endpoints.index(current_endpoint)
        if trainer_id == 0:
            log.info("train_id == 0, sleep 60s")
            time.sleep(60)
        log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} "
                 "trainer_id:{}".format(worker_endpoints, trainers_num,
                                        current_endpoint, trainer_id))
@@ -281,7 +282,7 @@ def train(args):
        next_sent_acc.name, mask_lm_loss.name, total_loss.name
    ])

    train_pyreader.set_batch_generator(data_reader.data_generator())
    train_pyreader.start()
    steps = 0
    cost = []
@@ -309,13 +310,13 @@ def train(args):
                lm_cost.extend(each_mask_lm_cost)
                cost.extend(each_total_cost)

                log.info("feed_queue size %d" % train_pyreader.queue.size())
                time_end = time.time()
                used_time = time_end - time_begin
                epoch, current_file_index, total_file, current_file, mask_type = data_reader.get_progress(
                )
                log.info("current learning_rate:%f" % np_lr[0])
                log.info(
                    "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                    "ppl: %f, next_sent_acc: %f, speed: %f steps/s, file: %s, mask_type: %s"
                    % (epoch, current_file_index, total_file, steps,
@@ -335,7 +336,7 @@ def train(args):
            if args.valid_filelist and steps % args.validation_steps == 0:
                vali_cost, vali_lm_cost, vali_acc, vali_steps, vali_speed = predict(
                )
                log.info("[validation_set] epoch: %d, step: %d, "
                         "loss: %f, global ppl: %f, batch-averaged ppl: %f, "
                         "next_sent_acc: %f, speed: %f steps/s" %
                         (epoch, steps, np.mean(np.array(vali_cost) / vali_steps),
@@ -349,6 +350,7 @@ def train(args):

if __name__ == '__main__':
    prepare_logger(log)
    print_arguments(args)
    check_cuda(args.use_cuda)
    if args.do_test:
......
@@ -12,17 +12,37 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Arguments for configuration."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import six
import os
import sys
import argparse
import logging

import paddle.fluid as fluid

log = logging.getLogger(__name__)


def prepare_logger(logger, debug=False, save_to_file=None):
    formatter = logging.Formatter(fmt='[%(levelname)s] %(asctime)s [%(filename)12s:%(lineno)5d]:\t%(message)s')
    console_hdl = logging.StreamHandler()
    console_hdl.setFormatter(formatter)
    logger.addHandler(console_hdl)
    if save_to_file is not None and not os.path.exists(save_to_file):
        file_hdl = logging.FileHandler(save_to_file)
        file_hdl.setFormatter(formatter)
        logger.addHandler(file_hdl)
    logger.setLevel(logging.DEBUG)
    logger.propagate = False
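
# Usage sketch (editor's addition): every entry point in this change calls
# prepare_logger on a module-level logger before anything else, e.g.:
#
#   log = logging.getLogger()
#   prepare_logger(log)                             # console handler, DEBUG
#   prepare_logger(log, save_to_file='train.log')   # also tee to a file
#
# Note the file handler is only attached when save_to_file does not exist
# yet, so a pre-existing log file silently disables file logging.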
def str2bool(v):
    # because argparse does not support to parse "true, False" as python
    # boolean directly
@@ -33,10 +53,11 @@ class ArgumentGroup(object):
    def __init__(self, parser, title, des):
        self._group = parser.add_argument_group(title=title, description=des)

    def add_arg(self, name, type, default, help, positional_arg=False, **kwargs):
        prefix = "" if positional_arg else "--"
        type = str2bool if type == bool else type
        self._group.add_argument(
            prefix + name,
            default=default,
            type=type,
            help=help + ' Default: %(default)s.',
@@ -44,10 +65,10 @@ class ArgumentGroup(object):


def print_arguments(args):
    log.info('----------- Configuration Arguments -----------')
    for arg, value in sorted(six.iteritems(vars(args))):
        log.info('%s: %s' % (arg, value))
    log.info('------------------------------------------------')


def check_cuda(use_cuda, err = \
@@ -56,7 +77,7 @@ def check_cuda(use_cuda, err = \
    ):
    try:
        if use_cuda and not fluid.is_compiled_with_cuda():
            log.error(err)
            sys.exit(1)
    except Exception:
        pass
@@ -11,7 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
......
# -*- coding: utf-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Evaluation script for CMRC 2018
version: v5
@@ -6,22 +19,25 @@ Note:
v5 formatted output, add usage description
v4 fixed segmentation issues
'''
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from collections import Counter, OrderedDict
import string
import re
import argparse
import json
import sys
import nltk
import pdb


# split Chinese with English
def mixed_segmentation(in_str, rm_punc=False):
    in_str = in_str.lower().strip()
    segs_out = []
    temp_str = ""
    sp_char = [
@@ -32,7 +48,7 @@ def mixed_segmentation(in_str, rm_punc=False):
    for char in in_str:
        if rm_punc and char in sp_char:
            continue
        if re.search(r'[\u4e00-\u9fa5]', char) or char in sp_char:
            if temp_str != "":
                ss = nltk.word_tokenize(temp_str)
                segs_out.extend(ss)
@@ -51,7 +67,7 @@ def mixed_segmentation(in_str, rm_punc=False):


# remove punctuation
def remove_punctuation(in_str):
    in_str = in_str.lower().strip()
    sp_char = [
        '-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', ',', '。', ':',
        '?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、', '「', '」', '(',
@@ -102,7 +118,7 @@ def evaluate(ground_truth_file, prediction_file):
            skip_count += 1
            continue

        prediction = prediction_file[query_id]
        f1 += calc_f1_score(answers, prediction)
        em += calc_em_score(answers, prediction)
@@ -139,8 +155,8 @@ def calc_em_score(answers, prediction):


def eval_file(dataset_file, prediction_file):
    ground_truth_file = json.load(open(dataset_file, 'r'))
    prediction_file = json.load(open(prediction_file, 'r'))
    F1, EM, TOTAL, SKIP = evaluate(ground_truth_file, prediction_file)
    AVG = (EM + F1) * 0.5
    return EM, F1, AVG, TOTAL
......
import sys
import numpy as np
import re
from propeller import log
import itertools
from propeller.paddle.data import Dataset
import pickle
import six
if six.PY2:
import operator
def accumulate(iterable, func=operator.add, initial=None):
'Return running totals'
# accumulate([1,2,3,4,5]) --> 1 3 6 10 15
# accumulate([1,2,3,4,5], initial=100) --> 100 101 103 106 110 115
# accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120
it = iter(iterable)
total = initial
if initial is None:
try:
total = next(it)
except StopIteration:
return
yield total
for element in it:
total = func(total, element)
yield total
else:
from itertools import accumulate
max_input_chars_per_word = 100
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
def wordpiece(token, vocab, unk_token, sentencepiece_style_vocab=False):
"""call with single word"""
chars = list(token)
if len(chars) > max_input_chars_per_word:
return [unk_token], [(0, len(chars))]
is_bad = False
start = 0
sub_tokens = []
sub_pos = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start == 0 and sentencepiece_style_vocab:
substr = u'\u2581' + substr
if start > 0 and not sentencepiece_style_vocab:
substr = "##" + substr
if substr in vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
sub_pos.append((start, end))
start = end
if is_bad:
return [unk_token], [(0, len(chars))]
else:
return sub_tokens, sub_pos
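
# Example (editor's addition): greedy longest-match-first subword split.
#
#   wordpiece('unaffable', vocab={'un', '##aff', '##able'}, unk_token='[UNK]')
#   # -> (['un', '##aff', '##able'], [(0, 2), (2, 5), (5, 9)])
#
# Any word that cannot be fully covered by vocab pieces collapses to
# ([unk_token], [(0, len(word))]).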
class SpaceTokenizer(object):
def __init__(self, vocab, lower=True):
"""
char tokenizer (wordpiece english)
normed txt(space seperated or not) => list of word-piece
"""
self.vocab = set(vocab)
self.lower = lower
def __call__(self, sen):
if len(sen) == 0:
return [] #empty line
sen = sen.decode('utf8')
if self.lower:
sen = sen.lower()
res = []
for s in sen.split(' '):
            # split(' ') yields '' for runs of spaces; skip those
            if s == '':
                continue
if s in self.vocab:
res.append(s)
else:
res.append('[UNK]')
return res
class CharTokenizer(object):
def __init__(self, vocab, lower=True, sentencepiece_style_vocab=False):
"""
char tokenizer (wordpiece english)
normed txt(space seperated or not) => list of word-piece
"""
self.vocab = set(vocab)
#self.pat = re.compile(r'([,.!?\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b]|[\u4e00-\u9fa5]|[a-zA-Z0-9]+)')
self.pat = re.compile(r'([a-zA-Z0-9]+|\S)')
self.lower = lower
self.sentencepiece_style_vocab = sentencepiece_style_vocab
def __call__(self, sen):
if len(sen) == 0:
return [] #empty line
sen = sen.decode('utf8')
if self.lower:
sen = sen.lower()
res = []
for match in self.pat.finditer(sen):
words, _ = wordpiece(match.group(0), vocab=self.vocab, unk_token='[UNK]', sentencepiece_style_vocab=self.sentencepiece_style_vocab)
res.extend(words)
return res
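
# Usage sketch (editor's addition): the regex splits text into ASCII
# alphanumeric runs and single non-space characters (so each Chinese char is
# its own unit), then each unit goes through wordpiece(). The vocab file
# path below is illustrative; input must be utf8-encoded bytes:
#
#   vocab = [line.split('\t')[0] for line in io.open('vocab.txt', encoding='utf8')]
#   tok = CharTokenizer(vocab)
#   pieces = tok(u'ERNIE模型'.encode('utf8'))   # flat list of word pieces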
class WSSPTokenizer(object):
def __init__(self, sp_model_dir, word_dict, ws=True, lower=True):
self.ws = ws
self.lower = lower
self.dict = pickle.load(open(word_dict, 'rb'), encoding='utf8')
import sentencepiece as spm
self.sp_model = spm.SentencePieceProcessor()
self.window_size = 5
self.sp_model.Load(sp_model_dir)
def cut(self, chars):
words = []
idx = 0
while idx < len(chars):
matched = False
for i in range(self.window_size, 0, -1):
cand = chars[idx: idx+i]
if cand in self.dict:
words.append(cand)
matched = True
break
if not matched:
i = 1
words.append(chars[idx])
idx += i
return words
def __call__(self, sen):
sen = sen.decode('utf8')
if self.ws:
sen = [s for s in self.cut(sen) if s != ' ']
else:
sen = sen.split(' ')
if self.lower:
sen = [s.lower() for s in sen]
sen = ' '.join(sen)
ret = self.sp_model.EncodeAsPieces(sen)
return ret
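
# Usage sketch (editor's addition; model/dict paths are illustrative):
# WSSPTokenizer first does greedy forward-maximum-matching word segmentation
# (window of 5 chars) against `word_dict`, then re-encodes the space-joined
# words with the sentencepiece model, i.e. utf8 bytes -> words -> pieces:
#
#   tok = WSSPTokenizer('spm_cased.model', 'dict.wordseg.pickle', ws=True)
#   pieces = tok(u'谁是周杰伦'.encode('utf8'))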
def build_2_pair(seg_a, seg_b, max_seqlen, cls_id, sep_id):
token_type_a = np.ones_like(seg_a, dtype=np.int64) * 0
token_type_b = np.ones_like(seg_b, dtype=np.int64) * 1
sen_emb = np.concatenate([[cls_id], seg_a, [sep_id], seg_b, [sep_id]], 0)
token_type_emb = np.concatenate([[0], token_type_a, [0], token_type_b, [1]], 0)
seqlen = sen_emb.shape[0]
#random truncate
random_begin = 0 #np.random.randint(0, np.maximum(0, seqlen - max_seqlen) + 1,)
sen_emb = sen_emb[random_begin: random_begin + max_seqlen]
token_type_emb = token_type_emb[random_begin: random_begin + max_seqlen]
return sen_emb, token_type_emb
def build_1_pair(seg_a, max_seqlen, cls_id, sep_id):
token_type_a = np.ones_like(seg_a, dtype=np.int64) * 0
sen_emb = np.concatenate([[cls_id], seg_a, [sep_id]], 0)
token_type_emb = np.concatenate([[0], token_type_a, [0]], 0)
seqlen = sen_emb.shape[0]
#random truncate
random_begin = 0 #np.random.randint(0, np.maximum(0, seqlen - max_seqlen) + 1,)
sen_emb = sen_emb[random_begin: random_begin + max_seqlen]
token_type_emb = token_type_emb[random_begin: random_begin + max_seqlen]
return sen_emb, token_type_emb
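
# Layout sketch (editor's addition): for token-id arrays a = [5, 6] and
# b = [7], with cls_id=1 and sep_id=2, build_2_pair produces
#
#   sen_emb        = [1, 5, 6, 2, 7, 2]    # [CLS] a [SEP] b [SEP]
#   token_type_emb = [0, 0, 0, 0, 1, 1]
#
# and build_1_pair(a, ...) gives [1, 5, 6, 2] with token types [0, 0, 0, 0];
# both are then cut to max_seqlen (random truncation is disabled, so always
# from the start of the sequence).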
def expand_dims(*args):
func = lambda i: np.expand_dims(i, -1)
ret = [func(i) for i in args]
return ret
def interleave(ds1, ds2):
def gen():
for i, j in six.moves.zip_longest(iter(ds1), iter(ds2)):
if i is not None:
yield i
if j is not None:
yield j
return Dataset.from_generator_func(gen)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
def append_cast_op(i, o, prog):
"""
Append a cast op in a given Program to cast input `i` to data type `o.dtype`.
Args:
i (Variable): The input Variable.
o (Variable): The output Variable.
prog (Program): The Program to append cast op.
"""
prog.global_block().append_op(
type="cast",
inputs={"X": i},
outputs={"Out": o},
attrs={"in_dtype": i.dtype,
"out_dtype": o.dtype})
def copy_to_master_param(p, block):
v = block.vars.get(p.name, None)
if v is None:
raise ValueError("no param name %s found!" % p.name)
new_p = fluid.framework.Parameter(
block=block,
shape=v.shape,
dtype=fluid.core.VarDesc.VarType.FP32,
type=v.type,
lod_level=v.lod_level,
stop_gradient=p.stop_gradient,
trainable=p.trainable,
optimize_attr=p.optimize_attr,
regularizer=p.regularizer,
gradient_clip_attr=p.gradient_clip_attr,
error_clip=p.error_clip,
name=v.name + ".master")
return new_p
def apply_dynamic_loss_scaling(loss_scaling, master_params_grads,
incr_every_n_steps, decr_every_n_nan_or_inf,
incr_ratio, decr_ratio):
_incr_every_n_steps = fluid.layers.fill_constant(
shape=[1], dtype='int32', value=incr_every_n_steps)
_decr_every_n_nan_or_inf = fluid.layers.fill_constant(
shape=[1], dtype='int32', value=decr_every_n_nan_or_inf)
_num_good_steps = fluid.layers.create_global_var(
name=fluid.unique_name.generate("num_good_steps"),
shape=[1],
value=0,
dtype='int32',
persistable=True)
_num_bad_steps = fluid.layers.create_global_var(
name=fluid.unique_name.generate("num_bad_steps"),
shape=[1],
value=0,
dtype='int32',
persistable=True)
grads = [fluid.layers.reduce_sum(g) for [_, g] in master_params_grads]
all_grads = fluid.layers.concat(grads)
all_grads_sum = fluid.layers.reduce_sum(all_grads)
is_overall_finite = fluid.layers.isfinite(all_grads_sum)
update_loss_scaling(is_overall_finite, loss_scaling, _num_good_steps,
_num_bad_steps, _incr_every_n_steps,
_decr_every_n_nan_or_inf, incr_ratio, decr_ratio)
# apply_gradient append all ops in global block, thus we shouldn't
# apply gradient in the switch branch.
with fluid.layers.Switch() as switch:
with switch.case(is_overall_finite):
pass
with switch.default():
for _, g in master_params_grads:
fluid.layers.assign(fluid.layers.zeros_like(g), g)
def create_master_params_grads(params_grads, main_prog, startup_prog,
loss_scaling):
master_params_grads = []
for p, g in params_grads:
with main_prog._optimized_guard([p, g]):
# create master parameters
master_param = copy_to_master_param(p, main_prog.global_block())
startup_master_param = startup_prog.global_block()._clone_variable(
master_param)
startup_p = startup_prog.global_block().var(p.name)
append_cast_op(startup_p, startup_master_param, startup_prog)
# cast fp16 gradients to fp32 before apply gradients
if g.name.find("layer_norm") > -1:
scaled_g = g / loss_scaling
master_params_grads.append([p, scaled_g])
continue
master_grad = fluid.layers.cast(g, "float32")
master_grad = master_grad / loss_scaling
master_params_grads.append([master_param, master_grad])
return master_params_grads
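# Note on the flow above (added for clarity): parameters keep fp32 "master"
# copies that the optimizer updates; fp16 gradients are cast to fp32 and
# divided by `loss_scaling` first. layer_norm parameters stay in fp32
# throughout, so their gradients are only unscaled, not cast.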
def master_param_to_train_param(master_params_grads, params_grads, main_prog):
for idx, m_p_g in enumerate(master_params_grads):
train_p, _ = params_grads[idx]
if train_p.name.find("layer_norm") > -1:
continue
with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
append_cast_op(m_p_g[0], train_p, main_prog)
def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps,
num_bad_steps, incr_every_n_steps,
decr_every_n_nan_or_inf, incr_ratio, decr_ratio):
"""
    Update loss scaling according to overall gradients. If all gradients are
    finite after incr_every_n_steps consecutive steps, loss scaling will
    increase by incr_ratio. Otherwise, loss scaling will decrease by
    decr_ratio after decr_every_n_nan_or_inf accumulated steps in which some
    gradients are infinite.
Args:
is_overall_finite (Variable): A boolean variable indicates whether
all gradients are finite.
prev_loss_scaling (Variable): Previous loss scaling.
num_good_steps (Variable): A variable accumulates good steps in which
all gradients are finite.
num_bad_steps (Variable): A variable accumulates bad steps in which
some gradients are infinite.
incr_every_n_steps (Variable): A variable represents increasing loss
scaling every n consecutive steps with
finite gradients.
decr_every_n_nan_or_inf (Variable): A variable represents decreasing
loss scaling every n accumulated
steps with nan or inf gradients.
incr_ratio(float): The multiplier to use when increasing the loss
scaling.
decr_ratio(float): The less-than-one-multiplier to use when decreasing
loss scaling.
"""
zero_steps = fluid.layers.fill_constant(shape=[1], dtype='int32', value=0)
with fluid.layers.Switch() as switch:
with switch.case(is_overall_finite):
should_incr_loss_scaling = fluid.layers.less_than(
incr_every_n_steps, num_good_steps + 1)
with fluid.layers.Switch() as switch1:
with switch1.case(should_incr_loss_scaling):
new_loss_scaling = prev_loss_scaling * incr_ratio
loss_scaling_is_finite = fluid.layers.isfinite(
new_loss_scaling)
with fluid.layers.Switch() as switch2:
with switch2.case(loss_scaling_is_finite):
fluid.layers.assign(new_loss_scaling,
prev_loss_scaling)
with switch2.default():
pass
fluid.layers.assign(zero_steps, num_good_steps)
fluid.layers.assign(zero_steps, num_bad_steps)
with switch1.default():
fluid.layers.increment(num_good_steps)
fluid.layers.assign(zero_steps, num_bad_steps)
with switch.default():
should_decr_loss_scaling = fluid.layers.less_than(
decr_every_n_nan_or_inf, num_bad_steps + 1)
with fluid.layers.Switch() as switch3:
with switch3.case(should_decr_loss_scaling):
new_loss_scaling = prev_loss_scaling * decr_ratio
static_loss_scaling = \
fluid.layers.fill_constant(shape=[1],
dtype='float32',
value=1.0)
less_than_one = fluid.layers.less_than(new_loss_scaling,
static_loss_scaling)
with fluid.layers.Switch() as switch4:
with switch4.case(less_than_one):
fluid.layers.assign(static_loss_scaling,
prev_loss_scaling)
with switch4.default():
fluid.layers.assign(new_loss_scaling,
prev_loss_scaling)
fluid.layers.assign(zero_steps, num_good_steps)
fluid.layers.assign(zero_steps, num_bad_steps)
with switch3.default():
fluid.layers.assign(zero_steps, num_good_steps)
fluid.layers.increment(num_bad_steps)
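

# The Switch/case logic above runs inside the fluid graph. As a plain-Python
# sketch (added for illustration only; not used by the trainer), the update
# rule implemented by update_loss_scaling() is:
def _update_loss_scaling_sketch(is_finite, scale, good, bad,
                                incr_every_n, decr_every_n,
                                incr_ratio, decr_ratio):
    import math
    if is_finite:
        if incr_every_n < good + 1:  # enough consecutive finite steps
            new_scale = scale * incr_ratio
            if math.isfinite(new_scale):  # guard against overflowing the scale itself
                scale = new_scale
            good, bad = 0, 0
        else:
            good, bad = good + 1, 0
    else:
        if decr_every_n < bad + 1:  # enough accumulated nan/inf steps
            scale = max(scale * decr_ratio, 1.0)  # never drop below 1.0
            good, bad = 0, 0
        else:
            good, bad = 0, bad + 1
    return scale, good, bad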
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os
import six
import ast
import copy
import logging

import numpy as np
import paddle.fluid as fluid

log = logging.getLogger(__name__)


def cast_fp32_to_fp16(exe, main_program):
    log.info("Cast parameters to float16 data format.")
    for param in main_program.global_block().all_parameters():
        if not param.name.endswith(".master"):
            param_t = fluid.global_scope().find_var(param.name).get_tensor()
            data = np.array(param_t)
            if param.name.startswith("encoder_layer") \
                    and "layer_norm" not in param.name:
                param_t.set(np.float16(data).view(np.uint16), exe.place)
            #load fp32
            master_param_var = fluid.global_scope().find_var(param.name +
                                                             ".master")
            if master_param_var is not None:
                ...


def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False):
    ...
        init_checkpoint_path,
        main_program=main_program,
        predicate=existed_persitables)
    log.info("Load model from {}".format(init_checkpoint_path))

    if use_fp16:
        cast_fp32_to_fp16(exe, main_program)


def init_pretraining_params(exe,
                            ...):
    ...
        pretraining_params_path,
        main_program=main_program,
        predicate=existed_params)
    log.info("Load pretraining parameters from {}.".format(
        pretraining_params_path))

    if use_fp16:
        ...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import time
import logging
from random import random
from functools import reduce, partial
import numpy as np
import multiprocessing
import paddle
import paddle.fluid as F
import paddle.fluid.layers as L
from model.ernie import ErnieModel
from optimization import optimization
import utils.data
from propeller import log
import propeller.paddle as propeller
log.setLevel(logging.DEBUG)
class ClassificationErnieModel(propeller.train.Model):
"""propeller Model wraper for paddle-ERNIE """
def __init__(self, hparam, mode, run_config):
self.hparam = hparam
self.mode = mode
self.run_config = run_config
def forward(self, features):
src_ids, sent_ids = features
zero = L.fill_constant([1], dtype='int64', value=0)
input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)), 'float32') # assume pad id == 0
#input_mask = L.unsqueeze(input_mask, axes=[2])
d_shape = L.shape(src_ids)
seqlen = d_shape[1]
batch_size = d_shape[0]
pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])
pos_ids = L.expand(pos_ids, [batch_size, 1])
pos_ids = L.unsqueeze(pos_ids, axes=[2])
pos_ids = L.cast(pos_ids, 'int64')
pos_ids.stop_gradient = True
input_mask.stop_gradient = True
task_ids = L.zeros_like(src_ids) + self.hparam.task_id
task_ids.stop_gradient = True
ernie = ErnieModel(
src_ids=src_ids,
position_ids=pos_ids,
sentence_ids=sent_ids,
task_ids=task_ids,
input_mask=input_mask,
config=self.hparam,
use_fp16=self.hparam['use_fp16']
)
cls_feats = ernie.get_pooled_output()
cls_feats = L.dropout(
x=cls_feats,
dropout_prob=0.1,
dropout_implementation="upscale_in_train"
)
logits = L.fc(
input=cls_feats,
size=self.hparam['num_label'],
param_attr=F.ParamAttr(
name="cls_out_w",
initializer=F.initializer.TruncatedNormal(scale=0.02)),
bias_attr=F.ParamAttr(
name="cls_out_b", initializer=F.initializer.Constant(0.))
)
propeller.summary.histogram('pred', logits)
if self.mode is propeller.RunMode.PREDICT:
probs = L.softmax(logits)
return probs
else:
return logits
def loss(self, predictions, labels):
ce_loss, probs = L.softmax_with_cross_entropy(
logits=predictions, label=labels, return_softmax=True)
#L.Print(ce_loss, message='per_example_loss')
loss = L.mean(x=ce_loss)
return loss
def backward(self, loss):
scheduled_lr, _ = optimization(
loss=loss,
warmup_steps=int(self.run_config.max_steps * self.hparam['warmup_proportion']),
num_train_steps=self.run_config.max_steps,
learning_rate=self.hparam['learning_rate'],
train_program=F.default_main_program(),
startup_prog=F.default_startup_program(),
weight_decay=self.hparam['weight_decay'],
scheduler="linear_warmup_decay",)
propeller.summary.scalar('lr', scheduled_lr)
def metrics(self, predictions, label):
predictions = L.argmax(predictions, axis=1)
predictions = L.unsqueeze(predictions, axes=[1])
acc = propeller.metrics.Acc(label, predictions)
#auc = propeller.metrics.Auc(label, predictions)
return {'acc': acc}
if __name__ == '__main__':
parser = propeller.ArgumentParser('classify model with ERNIE')
parser.add_argument('--max_seqlen', type=int, default=128)
parser.add_argument('--data_dir', type=str, required=True)
parser.add_argument('--vocab_file', type=str, required=True)
parser.add_argument('--do_predict', action='store_true')
parser.add_argument('--warm_start_from', type=str)
parser.add_argument('--sentence_piece_model', type=str, default=None)
parser.add_argument('--word_dict', type=str, default=None)
args = parser.parse_args()
run_config = propeller.parse_runconfig(args)
hparams = propeller.parse_hparam(args)
vocab = {j.strip().split(b'\t')[0].decode('utf8'): i for i, j in enumerate(open(args.vocab_file, 'rb'))}
sep_id = vocab['[SEP]']
cls_id = vocab['[CLS]']
unk_id = vocab['[UNK]']
if args.sentence_piece_model is not None:
if args.word_dict is None:
            raise ValueError('--word_dict not specified in subword model')
tokenizer = utils.data.WSSPTokenizer(args.sentence_piece_model, args.word_dict, ws=True, lower=True)
else:
tokenizer = utils.data.CharTokenizer(vocab.keys())
def tokenizer_func(inputs):
'''avoid pickle error'''
ret = tokenizer(inputs)
return ret
if not args.do_predict:
feature_column = propeller.data.FeatureColumns([
propeller.data.TextColumn('title',unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func),
propeller.data.LabelColumn('label'),
])
def before(seg_a, label):
sentence, segments = utils.data.build_1_pair(seg_a, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id)
return sentence, segments, label
def after(sentence, segments, label):
sentence, segments, label = utils.data.expand_dims(sentence, segments, label)
return sentence, segments, label
log.debug(os.path.join(args.data_dir, 'train'))
train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=True, use_gz=False) \
.map(before) \
.padded_batch(hparams.batch_size, (0, 0, 0)) \
.map(after)
dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
.map(before) \
.padded_batch(hparams.batch_size, (0, 0, 0)) \
.map(after)
shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1], [-1, 1])
types = ('int64', 'int64', 'int64')
train_ds.data_shapes = shapes
train_ds.data_types = types
dev_ds.data_shapes = shapes
dev_ds.data_types = types
varname_to_warmstart = re.compile(r'^encoder.*[wb]_0$|^.*embedding$|^.*bias$|^.*scale$|^pooled_fc.[wb]_0$')
warm_start_dir = args.warm_start_from
ws = propeller.WarmStartSetting(
predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(warm_start_dir, v.name)),
from_dir=warm_start_dir
)
best_exporter = propeller.train.exporter.BestInferenceModelExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['eval']['acc'] > old['eval']['acc'])
propeller.train.train_and_eval(
model_class_or_model_fn=ClassificationErnieModel,
params=hparams,
run_config=run_config,
train_dataset=train_ds,
eval_dataset=dev_ds,
warm_start_setting=ws,
exporters=[best_exporter])
print('dev_acc\t%.5f' % (best_exporter._best['eval']['acc']))
else:
feature_column = propeller.data.FeatureColumns([
propeller.data.TextColumn('title',unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func),
propeller.data.LabelColumn('label'),
])
def before(seg_a):
sentence, segments = utils.data.build_1_pair(seg_a, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id)
return sentence, segments
def after(sentence, segments):
sentence, segments = utils.data.expand_dims(sentence, segments)
return sentence, segments
predict_ds = feature_column.build_dataset_from_stdin('predict') \
.map(before) \
.padded_batch(hparams.batch_size, (0, 0)) \
.map(after)
shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1])
types = ('int64', 'int64')
predict_ds.data_shapes = shapes
predict_ds.data_types = types
finetuned_model = propeller.Learner(ClassificationErnieModel, run_config, hparams)
for logits, in finetuned_model.predict(predict_ds, ckpt=-1): # ckpt=-1 means last step
print(np.argmax(logits))
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os
import re
import time
from random import random
from functools import reduce, partial
import numpy as np
import multiprocessing
import logging
import six
import paddle
import paddle.fluid as F
import paddle.fluid.layers as L
from model.ernie import ErnieModel
from optimization import optimization
import utils.data
from propeller import log
log.setLevel(logging.DEBUG)
import propeller.paddle as propeller
class SequenceLabelErnieModel(propeller.train.Model):
"""propeller Model wraper for paddle-ERNIE """
def __init__(self, hparam, mode, run_config):
self.hparam = hparam
self.mode = mode
self.run_config = run_config
self.num_label = len(hparam['label_list'])
def forward(self, features):
src_ids, sent_ids, input_seqlen = features
zero = L.fill_constant([1], dtype='int64', value=0)
        input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)), 'float32') # assume pad id == 0; mask is 1.0 for real tokens, 0.0 for padding
#input_mask = L.unsqueeze(input_mask, axes=[2])
d_shape = L.shape(src_ids)
seqlen = d_shape[1]
batch_size = d_shape[0]
pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])
pos_ids = L.expand(pos_ids, [batch_size, 1])
pos_ids = L.unsqueeze(pos_ids, axes=[2])
pos_ids = L.cast(pos_ids, 'int64')
pos_ids.stop_gradient = True
input_mask.stop_gradient = True
        task_ids = L.zeros_like(src_ids) + self.hparam.task_id # task_ids are not used at the moment
task_ids.stop_gradient = True
model = ErnieModel(
src_ids=src_ids,
position_ids=pos_ids,
sentence_ids=sent_ids,
task_ids=task_ids,
input_mask=input_mask,
config=self.hparam,
use_fp16=self.hparam['use_fp16']
)
enc_out = model.get_sequence_output()
logits = L.fc(
input=enc_out,
size=self.num_label,
num_flatten_dims=2,
param_attr= F.ParamAttr(
name="cls_seq_label_out_w",
initializer= F.initializer.TruncatedNormal(scale=0.02)),
bias_attr=F.ParamAttr(
name="cls_seq_label_out_b",
initializer=F.initializer.Constant(0.)))
propeller.summary.histogram('pred', logits)
return logits, input_seqlen
def loss(self, predictions, labels):
logits, input_seqlen = predictions
logits = L.flatten(logits, axis=2)
labels = L.flatten(labels, axis=2)
ce_loss, probs = L.softmax_with_cross_entropy(
logits=logits, label=labels, return_softmax=True)
loss = L.mean(x=ce_loss)
return loss
def backward(self, loss):
scheduled_lr, _ = optimization(
loss=loss,
warmup_steps=int(self.run_config.max_steps * self.hparam['warmup_proportion']),
num_train_steps=self.run_config.max_steps,
learning_rate=self.hparam['learning_rate'],
train_program=F.default_main_program(),
startup_prog=F.default_startup_program(),
weight_decay=self.hparam['weight_decay'],
scheduler="linear_warmup_decay",)
propeller.summary.scalar('lr', scheduled_lr)
def metrics(self, predictions, label):
pred, seqlen = predictions
pred = L.argmax(pred, axis=-1)
pred = L.unsqueeze(pred, axes=[-1])
f1 = propeller.metrics.ChunkF1(label, pred, seqlen, self.num_label)
return {'f1': f1}
def make_sequence_label_dataset(name, input_files, label_list, tokenizer, batch_size, max_seqlen, is_train):
label_map = {v: i for i, v in enumerate(label_list)}
no_entity_id = label_map['O']
    delimiter = b'\2'  # assumption: tokens/labels are joined by the \2 control character in MSRA-NER data; bytes.split(b'') would raise ValueError
    def read_bio_data(filename):
        ds = propeller.data.Dataset.from_file(filename)
        def gen():
            iterator = iter(ds)
            while 1:
                line = next(iterator)
                cols = line.rstrip(b'\n').split(b'\t')
                if len(cols) != 2:  # validate the field count before indexing
                    continue
                tokens = cols[0].split(delimiter)
                labels = cols[1].split(delimiter)
                if len(tokens) != len(labels) or len(tokens) == 0:
                    continue
                yield [tokens, labels]
        return propeller.data.Dataset.from_generator_func(gen)
def reseg_token_label(dataset):
def gen():
iterator = iter(dataset)
while True:
tokens, labels = next(iterator)
assert len(tokens) == len(labels)
ret_tokens = []
ret_labels = []
for token, label in zip(tokens, labels):
sub_token = tokenizer(token)
label = label.decode('utf8')
if len(sub_token) == 0:
continue
ret_tokens.extend(sub_token)
ret_labels.append(label)
if len(sub_token) < 2:
continue
sub_label = label
if label.startswith("B-"):
sub_label = "I-" + label[2:]
ret_labels.extend([sub_label] * (len(sub_token) - 1))
assert len(ret_tokens) == len(ret_labels)
yield ret_tokens, ret_labels
ds = propeller.data.Dataset.from_generator_func(gen)
return ds
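    # Illustration (hypothetical example) of the resegmentation above: if the
    # tokenizer splits a token labeled 'B-LOC' into three sub-tokens, the
    # emitted labels become ['B-LOC', 'I-LOC', 'I-LOC'], keeping tokens and
    # labels aligned one-to-one.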
def convert_to_ids(dataset):
def gen():
iterator = iter(dataset)
while True:
tokens, labels = next(iterator)
if len(tokens) > max_seqlen - 2:
tokens = tokens[: max_seqlen - 2]
labels = labels[: max_seqlen - 2]
tokens = ['[CLS]'] + tokens + ['[SEP]']
token_ids = [vocab[t] for t in tokens]
label_ids = [no_entity_id] + [label_map[x] for x in labels] + [no_entity_id]
token_type_ids = [0] * len(token_ids)
input_seqlen = len(token_ids)
token_ids = np.array(token_ids, dtype=np.int64)
label_ids = np.array(label_ids, dtype=np.int64)
token_type_ids = np.array(token_type_ids, dtype=np.int64)
input_seqlen = np.array(input_seqlen, dtype=np.int64)
yield token_ids, token_type_ids, input_seqlen, label_ids
ds = propeller.data.Dataset.from_generator_func(gen)
return ds
def after(*features):
return utils.data.expand_dims(*features)
dataset = propeller.data.Dataset.from_list(input_files)
if is_train:
dataset = dataset.repeat().shuffle(buffer_size=len(input_files))
dataset = dataset.interleave(map_fn=read_bio_data, cycle_length=len(input_files), block_length=1)
if is_train:
dataset = dataset.shuffle(buffer_size=100)
dataset = reseg_token_label(dataset)
dataset = convert_to_ids(dataset)
dataset = dataset.padded_batch(batch_size).map(after)
dataset.name = name
return dataset
def make_sequence_label_dataset_from_stdin(name, tokenizer, batch_size, max_seqlen):
    delimiter = b'\2'  # assumption: tokens are joined by the \2 control character, as in the training data; bytes.split(b'') would raise ValueError
def stdin_gen():
if six.PY3:
source = sys.stdin.buffer
else:
source = sys.stdin
while True:
line = source.readline()
if len(line) == 0:
break
yield line,
    def read_bio_data(ds):
        def gen():
            iterator = iter(ds)
            while 1:
                line, = next(iterator)
                cols = line.rstrip(b'\n').split(b'\t')
                if len(cols) != 1:  # validate the field count before splitting
                    continue
                tokens = cols[0].split(delimiter)
                if len(tokens) == 0:
                    continue
                yield tokens,
        return propeller.data.Dataset.from_generator_func(gen)
def reseg_token_label(dataset):
def gen():
iterator = iter(dataset)
while True:
tokens, = next(iterator)
ret_tokens = []
for token in tokens:
sub_token = tokenizer(token)
if len(sub_token) == 0:
continue
ret_tokens.extend(sub_token)
if len(sub_token) < 2:
continue
yield ret_tokens,
ds = propeller.data.Dataset.from_generator_func(gen)
return ds
def convert_to_ids(dataset):
def gen():
iterator = iter(dataset)
while True:
tokens, = next(iterator)
if len(tokens) > max_seqlen - 2:
tokens = tokens[: max_seqlen - 2]
tokens = ['[CLS]'] + tokens + ['[SEP]']
token_ids = [vocab[t] for t in tokens]
token_type_ids = [0] * len(token_ids)
input_seqlen = len(token_ids)
token_ids = np.array(token_ids, dtype=np.int64)
token_type_ids = np.array(token_type_ids, dtype=np.int64)
input_seqlen = np.array(input_seqlen, dtype=np.int64)
yield token_ids, token_type_ids, input_seqlen
ds = propeller.data.Dataset.from_generator_func(gen)
return ds
def after(*features):
return utils.data.expand_dims(*features)
dataset = propeller.data.Dataset.from_generator_func(stdin_gen)
dataset = read_bio_data(dataset)
dataset = reseg_token_label(dataset)
dataset = convert_to_ids(dataset)
dataset = dataset.padded_batch(batch_size).map(after)
dataset.name = name
return dataset
if __name__ == '__main__':
parser = propeller.ArgumentParser('NER model with ERNIE')
parser.add_argument('--max_seqlen', type=int, default=128)
parser.add_argument('--data_dir', type=str, required=True)
parser.add_argument('--vocab_file', type=str, required=True)
parser.add_argument('--do_predict', action='store_true')
parser.add_argument('--use_sentence_piece_vocab', action='store_true')
parser.add_argument('--warm_start_from', type=str)
args = parser.parse_args()
run_config = propeller.parse_runconfig(args)
hparams = propeller.parse_hparam(args)
vocab = {j.strip().split('\t')[0]: i for i, j in enumerate(open(args.vocab_file, 'r', encoding='utf8'))}
tokenizer = utils.data.CharTokenizer(vocab, sentencepiece_style_vocab=args.use_sentence_piece_vocab)
sep_id = vocab['[SEP]']
cls_id = vocab['[CLS]']
unk_id = vocab['[UNK]']
pad_id = vocab['[PAD]']
label_list = ['B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'O']
hparams['label_list'] = label_list
if not args.do_predict:
train_data_dir = os.path.join(args.data_dir, 'train')
train_input_files = [os.path.join(train_data_dir, filename) for filename in os.listdir(train_data_dir)]
dev_data_dir = os.path.join(args.data_dir, 'dev')
dev_input_files = [os.path.join(dev_data_dir, filename) for filename in os.listdir(dev_data_dir)]
test_data_dir = os.path.join(args.data_dir, 'test')
test_input_files = [os.path.join(test_data_dir, filename) for filename in os.listdir(test_data_dir)]
train_ds = make_sequence_label_dataset(name='train',
input_files=train_input_files,
label_list=label_list,
tokenizer=tokenizer,
batch_size=hparams.batch_size,
max_seqlen=args.max_seqlen,
is_train=True)
dev_ds = make_sequence_label_dataset(name='dev',
input_files=dev_input_files,
label_list=label_list,
tokenizer=tokenizer,
batch_size=hparams.batch_size,
max_seqlen=args.max_seqlen,
is_train=False)
test_ds = make_sequence_label_dataset(name='test',
input_files=test_input_files,
label_list=label_list,
tokenizer=tokenizer,
batch_size=hparams.batch_size,
max_seqlen=args.max_seqlen,
is_train=False)
shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1], [-1, 1], [-1, args.max_seqlen, 1])
types = ('int64', 'int64', 'int64', 'int64')
train_ds.data_shapes = shapes
train_ds.data_types = types
dev_ds.data_shapes = shapes
dev_ds.data_types = types
test_ds.data_shapes = shapes
test_ds.data_types = types
varname_to_warmstart = re.compile(r'^encoder.*[wb]_0$|^.*embedding$|^.*bias$|^.*scale$|^pooled_fc.[wb]_0$')
warm_start_dir = args.warm_start_from
ws = propeller.WarmStartSetting(
predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(warm_start_dir, v.name)),
from_dir=warm_start_dir
)
best_exporter = propeller.train.exporter.BestInferenceModelExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['dev']['f1'] > old['dev']['f1'])
propeller.train.train_and_eval(
model_class_or_model_fn=SequenceLabelErnieModel,
params=hparams,
run_config=run_config,
train_dataset=train_ds,
eval_dataset={'dev': dev_ds, 'test': test_ds},
warm_start_setting=ws,
exporters=[best_exporter])
for k in best_exporter._best['dev'].keys():
if 'loss' in k:
continue
dev_v = best_exporter._best['dev'][k]
test_v = best_exporter._best['test'][k]
print('dev_%s\t%.5f\ntest_%s\t%.5f' % (k, dev_v, k, test_v))
else:
predict_ds = make_sequence_label_dataset_from_stdin(name='pred',
tokenizer=tokenizer,
batch_size=hparams.batch_size,
max_seqlen=args.max_seqlen)
shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1], [-1, 1])
types = ('int64', 'int64', 'int64')
predict_ds.data_shapes = shapes
predict_ds.data_types = types
rev_label_map = {i: v for i, v in enumerate(label_list)}
learner = propeller.Learner(SequenceLabelErnieModel, run_config, hparams)
for pred, _ in learner.predict(predict_ds, ckpt=-1):
pred_str = ' '.join([rev_label_map[idx] for idx in np.argmax(pred, 1).tolist()])
print(pred_str)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import time
import logging
import six
import sys
import io
from random import random
from functools import reduce, partial, wraps
import numpy as np
import multiprocessing
import paddle
import paddle.fluid as F
import paddle.fluid.layers as L
from model.ernie import ErnieModel
from optimization import optimization
import utils.data
from propeller import log
import propeller.paddle as propeller
log.setLevel(logging.DEBUG)
class RankingErnieModel(propeller.train.Model):
"""propeller Model wraper for paddle-ERNIE """
def __init__(self, hparam, mode, run_config):
self.hparam = hparam
self.mode = mode
self.run_config = run_config
def forward(self, features):
src_ids, sent_ids, qid = features
zero = L.fill_constant([1], dtype='int64', value=0)
input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)), 'float32') # assume pad id == 0
#input_mask = L.unsqueeze(input_mask, axes=[2])
d_shape = L.shape(src_ids)
seqlen = d_shape[1]
batch_size = d_shape[0]
pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])
pos_ids = L.expand(pos_ids, [batch_size, 1])
pos_ids = L.unsqueeze(pos_ids, axes=[2])
pos_ids = L.cast(pos_ids, 'int64')
pos_ids.stop_gradient = True
input_mask.stop_gradient = True
        task_ids = L.zeros_like(src_ids) + self.hparam.task_id # task_ids are not used at the moment
task_ids.stop_gradient = True
ernie = ErnieModel(
src_ids=src_ids,
position_ids=pos_ids,
sentence_ids=sent_ids,
task_ids=task_ids,
input_mask=input_mask,
config=self.hparam,
use_fp16=self.hparam['use_fp16']
)
cls_feats = ernie.get_pooled_output()
cls_feats = L.dropout(
x=cls_feats,
dropout_prob=0.1,
dropout_implementation="upscale_in_train"
)
logits = L.fc(
input=cls_feats,
size=self.hparam['num_label'],
param_attr=F.ParamAttr(
name="cls_out_w",
initializer=F.initializer.TruncatedNormal(scale=0.02)),
bias_attr=F.ParamAttr(
name="cls_out_b", initializer=F.initializer.Constant(0.))
)
propeller.summary.histogram('pred', logits)
if self.mode is propeller.RunMode.PREDICT:
probs = L.softmax(logits)
return qid, probs
else:
return qid, logits
def loss(self, predictions, labels):
qid, predictions = predictions
ce_loss, probs = L.softmax_with_cross_entropy(
logits=predictions, label=labels, return_softmax=True)
#L.Print(ce_loss, message='per_example_loss')
loss = L.mean(x=ce_loss)
return loss
def metrics(self, predictions, label):
qid, logits = predictions
positive_class_logits = L.slice(logits, axes=[1], starts=[1], ends=[2])
mrr = propeller.metrics.Mrr(qid, label, positive_class_logits)
predictions = L.argmax(logits, axis=1)
predictions = L.unsqueeze(predictions, axes=[1])
f1 = propeller.metrics.F1(label, predictions)
acc = propeller.metrics.Acc(label, predictions)
#auc = propeller.metrics.Auc(label, predictions)
return {'acc': acc, 'f1': f1, 'mrr': mrr}
def backward(self, loss):
scheduled_lr, _ = optimization(
loss=loss,
warmup_steps=int(self.run_config.max_steps * self.hparam['warmup_proportion']),
num_train_steps=self.run_config.max_steps,
learning_rate=self.hparam['learning_rate'],
train_program=F.default_main_program(),
startup_prog=F.default_startup_program(),
weight_decay=self.hparam['weight_decay'],
scheduler="linear_warmup_decay",)
propeller.summary.scalar('lr', scheduled_lr)
if __name__ == '__main__':
parser = propeller.ArgumentParser('ranker model with ERNIE')
parser.add_argument('--do_predict', action='store_true')
parser.add_argument('--predict_model', type=str, default=None)
parser.add_argument('--max_seqlen', type=int, default=128)
parser.add_argument('--vocab_file', type=str, required=True)
parser.add_argument('--data_dir', type=str, required=True)
parser.add_argument('--warm_start_from', type=str)
parser.add_argument('--sentence_piece_model', type=str, default=None)
parser.add_argument('--word_dict', type=str, default=None)
args = parser.parse_args()
run_config = propeller.parse_runconfig(args)
hparams = propeller.parse_hparam(args)
vocab = {j.strip().split(b'\t')[0].decode('utf8') : i for i, j in enumerate(open(args.vocab_file, 'rb'))}
sep_id = vocab['[SEP]']
cls_id = vocab['[CLS]']
unk_id = vocab['[UNK]']
if args.sentence_piece_model is not None:
if args.word_dict is None:
            raise ValueError('--word_dict not specified in subword model')
tokenizer = utils.data.WSSPTokenizer(args.sentence_piece_model, args.word_dict, ws=True, lower=True)
else:
tokenizer = utils.data.CharTokenizer(vocab.keys())
def tokenizer_func(inputs):
'''avoid pickle error'''
ret = tokenizer(inputs)
return ret
shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1], [-1, 1], [-1, 1])
types = ('int64', 'int64', 'int64', 'int64')
if not args.do_predict:
feature_column = propeller.data.FeatureColumns([
propeller.data.LabelColumn('qid'),
propeller.data.TextColumn('title', vocab_dict=vocab, tokenizer=tokenizer_func, unk_id=unk_id),
propeller.data.TextColumn('comment', vocab_dict=vocab, tokenizer=tokenizer_func, unk_id=unk_id),
propeller.data.LabelColumn('label'),
])
def before(qid, seg_a, seg_b, label):
sentence, segments = utils.data.build_2_pair(seg_a, seg_b, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id)
return sentence, segments, qid, label
def after(sentence, segments, qid, label):
sentence, segments, qid, label = utils.data.expand_dims(sentence, segments, qid, label)
return sentence, segments, qid, label
train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=True, use_gz=False) \
.map(before) \
.padded_batch(hparams.batch_size, (0, 0, 0, 0)) \
.map(after)
dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
.map(before) \
.padded_batch(hparams.batch_size, (0, 0, 0, 0)) \
.map(after)
test_ds = feature_column.build_dataset('test', data_dir=os.path.join(args.data_dir, 'test'), shuffle=False, repeat=False, use_gz=False) \
.map(before) \
.padded_batch(hparams.batch_size, (0, 0, 0, 0)) \
.map(after)
train_ds.data_shapes = shapes
train_ds.data_types = types
dev_ds.data_shapes = shapes
dev_ds.data_types = types
test_ds.data_shapes = shapes
test_ds.data_types = types
varname_to_warmstart = re.compile(r'^encoder.*[wb]_0$|^.*embedding$|^.*bias$|^.*scale$|^pooled_fc.[wb]_0$')
warm_start_dir = args.warm_start_from
ws = propeller.WarmStartSetting(
predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(warm_start_dir, v.name)),
from_dir=warm_start_dir
)
best_exporter = propeller.train.exporter.BestInferenceModelExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['dev']['f1'] > old['dev']['f1'])
propeller.train_and_eval(
model_class_or_model_fn=RankingErnieModel,
params=hparams,
run_config=run_config,
train_dataset=train_ds,
eval_dataset={'dev': dev_ds, 'test': test_ds},
warm_start_setting=ws,
exporters=[best_exporter])
print('dev_mrr\t%.5f\ntest_mrr\t%.5f\ndev_f1\t%.5f\ntest_f1\t%.5f' % (
best_exporter._best['dev']['mrr'], best_exporter._best['test']['mrr'],
best_exporter._best['dev']['f1'], best_exporter._best['test']['f1'],
))
else:
feature_column = propeller.data.FeatureColumns([
propeller.data.LabelColumn('qid'),
propeller.data.TextColumn('title', unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func),
propeller.data.TextColumn('comment', unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func),
])
def before(qid, seg_a, seg_b):
sentence, segments = utils.data.build_2_pair(seg_a, seg_b, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id)
return sentence, segments, qid
def after(sentence, segments, qid):
sentence, segments, qid = utils.data.expand_dims(sentence, segments, qid)
return sentence, segments, qid
predict_ds = feature_column.build_dataset_from_stdin('predict') \
.map(before) \
.padded_batch(hparams.batch_size, (0, 0, 0)) \
.map(after)
predict_ds.data_shapes = shapes[: -1]
predict_ds.data_types = types[: -1]
est = propeller.Learner(RankingErnieModel, run_config, hparams)
for qid, res in est.predict(predict_ds, ckpt=-1):
print('%d\t%d\t%.5f\t%.5f' % (qid[0], np.argmax(res), res[0], res[1]))
#for i in predict_ds:
# sen = i[0]
# for ss in np.squeeze(sen):
# print(' '.join(map(str, ss)))
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"import numpy as np\n",
"import re\n",
"import logging\n",
"import json"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sys.path.append('../ernie')\n",
"sys.path.append('../')\n",
"%env CUDA_VICIBLE_DEVICES=7\n",
"# if CUDA_VICIBLE_DEVICES is changed, relaunch jupyter kernel to inform paddle"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import propeller.paddle as propeller\n",
"import paddle\n",
"import paddle.fluid as F\n",
"import paddle.fluid.layers as L\n",
"#import model defenition from original ERNIE\n",
"from model.ernie import ErnieModel\n",
"from tokenization import FullTokenizer\n",
"from optimization import optimization\n",
"from propeller import log\n",
"log.setLevel(logging.DEBUG)\n",
"\n",
"if paddle.__version__ not in ['1.5.1', '1.5.2']:\n",
" raise RuntimeError('propeller works in paddle1.5.1')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"# download pretrained model&config(ernie1.0) and xnli data\n",
"mkdir ernie1.0_pretrained\n",
"if [ ! -f ernie1.0_pretrained/ERNIE_stable-1.0.1.tar.gz ]\n",
"then\n",
" echo \"download model\"\n",
" wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz -P ernie1.0_pretrained\n",
"fi\n",
"\n",
"if [ ! -f task_data_zh.tgz ]\n",
"then\n",
" echo \"download data\"\n",
" wget --no-check-certificate https://ernie.bj.bcebos.com/task_data_zh.tgz\n",
"fi\n",
"\n",
"tar xzf ernie1.0_pretrained/ERNIE_stable-1.0.1.tar.gz -C ernie1.0_pretrained\n",
"tar xzf task_data_zh.tgz"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#define basic training settings\n",
"EPOCH=3\n",
"BATCH=16\n",
"LR=5e-3\n",
"MAX_SEQLEN=128\n",
"TASK_DATA='./task_data/'\n",
"MODEL='./ernie1.0_pretrained/'\n",
"OUTPUT_DIR='./output'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!rm -rf {OUTPUT_DIR}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#skip header, and reorganize train data into ./xnli_data \n",
"!mkdir xnli_data\n",
"!mkdir xnli_data/train\n",
"!mkdir xnli_data/test\n",
"!mkdir xnli_data/dev\n",
"\n",
"def remove_header_and_save(fname_in, fname_out):\n",
" with open(fname_out, 'w') as fout:\n",
" buf = open(fname_in).readlines()[1:]\n",
" for i in buf:\n",
" fout.write(i)\n",
" return len(buf)\n",
"train_data_size = remove_header_and_save(TASK_DATA + '/xnli/train.tsv', './xnli_data/train/part.0') \n",
"dev_data_size = remove_header_and_save(TASK_DATA + '/xnli/dev.tsv', './xnli_data/dev/part.0') \n",
"test_data_size = remove_header_and_save(TASK_DATA + '/xnli/test.tsv', './xnli_data/test/part.0') \n",
"print(train_data_size)\n",
"print(dev_data_size)\n",
"print(test_data_size)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenizer = FullTokenizer(MODEL + 'vocab.txt')\n",
"vocab = {j.strip().split('\\t')[0]: i for i, j in enumerate(open(MODEL + 'vocab.txt', encoding='utf8'))}\n",
"\n",
"print(tokenizer.tokenize('今天很热'))\n",
"print(tokenizer.tokenize('coding in paddle is cool'))\n",
"print(tokenizer.tokenize('[CLS]i have an pen')) # note: special token like [CLS], will be segmented, so please add these id after tokenization.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`propeller.data.FeatureColumns` defines the data schema in every data file.\n",
"\n",
"our data consist of 3 columns: seg_a, seg_b, label. with \"\\t\" as delemeter.\n",
"\n",
"`TextColumn` will do 3 things for you: \n",
"\n",
"1. tokenize input sentence with user-defined `tokenizer_func`\n",
"2. vocab lookup\n",
"3. serialize to protobuf bin file (optional)\n",
"\n",
"data file is organized into following patten:\n",
"\n",
"```script\n",
"./xnli_data\n",
"|-- dev\n",
"| `-- part.0\n",
"|-- test\n",
"| `-- part.0\n",
"|-- train\n",
" `-- part.0\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"sep_id = vocab['[SEP]']\n",
"cls_id = vocab['[CLS]']\n",
"unk_id = vocab['[UNK]']\n",
"\n",
"label_map = {\n",
" b\"contradictory\": 0,\n",
" b\"contradiction\": 0,\n",
" b\"entailment\": 1,\n",
" b\"neutral\": 2,\n",
"}\n",
"def tokenizer_func(inputs):\n",
" ret = tokenizer.tokenize(inputs) #`tokenize` will conver bytes to str, so we use a str vocab\n",
" return ret\n",
"\n",
"feature_column = propeller.data.FeatureColumns([\n",
" propeller.data.TextColumn('title', unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func),\n",
" propeller.data.TextColumn('comment', unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func),\n",
" propeller.data.LabelColumn('label', vocab_dict=label_map), #be careful, Columns deal with python3 bytes directly.\n",
"])"
]
},
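  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# illustrative sketch (added for clarity, not part of the original pipeline):\n",
    "# per field, `TextColumn` first tokenizes with `tokenizer_func` and then\n",
    "# looks the tokens up in the vocab, roughly equivalent to:\n",
    "toks = tokenizer_func('今天很热')\n",
    "ids = [vocab.get(t, unk_id) for t in toks]\n",
    "print(toks)\n",
    "print(ids)"
   ]
  },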
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## trian model in propeller can be defined in 2 ways:\n",
"1. subclass of `propeller.train.Model` which implements:\n",
" 1. `__init__` (hyper_param, mode, run_config)\n",
" 2. `forward` (features) => (prediction)\n",
" 3. `backword` (loss) => None\n",
" 4. `loss` (predictoin) => (loss)\n",
" 5. `metrics` (optional) (prediction) => (dict of propeller.Metrics)\n",
" \n",
"2. a callable takes following args:\n",
" 1. features\n",
" 2. param\n",
" 3. mode\n",
" 4. run_config(optional)\n",
" \n",
" and returns a propeller.ModelSpec\n",
" \n",
"we use the subclasss approch here"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class ClassificationErnieModel(propeller.train.Model):\n",
" def __init__(self, hparam, mode, run_config):\n",
" self.hparam = hparam\n",
" self.mode = mode\n",
" self.run_config = run_config\n",
"\n",
" def forward(self, features):\n",
" src_ids, sent_ids = features\n",
" dtype = 'float16' if self.hparam['use_fp16'] else 'float32'\n",
" zero = L.fill_constant([1], dtype='int64', value=0)\n",
" input_mask = L.cast(L.equal(src_ids, zero), dtype) # assume pad id == 0\n",
" #input_mask = L.unsqueeze(input_mask, axes=[2])\n",
" d_shape = L.shape(src_ids)\n",
" seqlen = d_shape[1]\n",
" batch_size = d_shape[0]\n",
" pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])\n",
" pos_ids = L.expand(pos_ids, [batch_size, 1])\n",
" pos_ids = L.unsqueeze(pos_ids, axes=[2])\n",
" pos_ids = L.cast(pos_ids, 'int64')\n",
" pos_ids.stop_gradient = True\n",
" input_mask.stop_gradient = True\n",
" task_ids = L.zeros_like(src_ids) + self.hparam.task_id #this shit wont use at the moment\n",
" task_ids.stop_gradient = True\n",
"\n",
" ernie = ErnieModel(\n",
" src_ids=src_ids,\n",
" position_ids=pos_ids,\n",
" sentence_ids=sent_ids,\n",
" task_ids=task_ids,\n",
" input_mask=input_mask,\n",
" config=self.hparam,\n",
" use_fp16=self.hparam['use_fp16']\n",
" )\n",
"\n",
" cls_feats = ernie.get_pooled_output()\n",
"\n",
" cls_feats = L.dropout(\n",
" x=cls_feats,\n",
" dropout_prob=0.1,\n",
" dropout_implementation=\"upscale_in_train\"\n",
" )\n",
"\n",
" logits = L.fc(\n",
" input=cls_feats,\n",
" size=self.hparam['num_label'],\n",
" param_attr=F.ParamAttr(\n",
" name=\"cls_out_w\",\n",
" initializer=F.initializer.TruncatedNormal(scale=0.02)),\n",
" bias_attr=F.ParamAttr(\n",
" name=\"cls_out_b\", initializer=F.initializer.Constant(0.))\n",
" )\n",
"\n",
" propeller.summary.histogram('pred', logits)\n",
"\n",
" if self.mode is propeller.RunMode.PREDICT:\n",
" probs = L.softmax(logits)\n",
" return probs\n",
" else:\n",
" return logits\n",
"\n",
" def loss(self, predictions, labels):\n",
" ce_loss, probs = L.softmax_with_cross_entropy(\n",
" logits=predictions, label=labels, return_softmax=True)\n",
" #L.Print(ce_loss, message='per_example_loss')\n",
" loss = L.mean(x=ce_loss)\n",
" return loss\n",
"\n",
" def backward(self, loss):\n",
" scheduled_lr, loss_scale = optimization(\n",
" loss=loss,\n",
" warmup_steps=int(self.run_config.max_steps * self.hparam['warmup_proportion']),\n",
" num_train_steps=self.run_config.max_steps,\n",
" learning_rate=self.hparam['learning_rate'],\n",
" train_program=F.default_main_program(),\n",
" startup_prog=F.default_startup_program(),\n",
" weight_decay=self.hparam['weight_decay'],\n",
" scheduler=\"linear_warmup_decay\",)\n",
" propeller.summary.scalar('lr', scheduled_lr)\n",
"\n",
" def metrics(self, predictions, label):\n",
" predictions = L.argmax(predictions, axis=1)\n",
" predictions = L.unsqueeze(predictions, axes=[1])\n",
" acc = propeller.metrics.Acc(label, predictions)\n",
" #auc = propeller.metrics.Auc(label, predictions)\n",
" return {'acc': acc}\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# define some utility function.\n",
"\n",
"def build_2_pair(seg_a, seg_b):\n",
" token_type_a = np.ones_like(seg_a, dtype=np.int64) * 0\n",
" token_type_b = np.ones_like(seg_b, dtype=np.int64) * 1\n",
" sen_emb = np.concatenate([[cls_id], seg_a, [sep_id], seg_b, [sep_id]], 0)\n",
" token_type_emb = np.concatenate([[0], token_type_a, [0], token_type_b, [1]], 0)\n",
" #seqlen = sen_emb.shape[0]\n",
" #deteministic truncate\n",
" sen_emb = sen_emb[0: MAX_SEQLEN]\n",
" token_type_emb = token_type_emb[0: MAX_SEQLEN]\n",
" return sen_emb, token_type_emb\n",
"\n",
"def expand_dims(*args):\n",
" func = lambda i: np.expand_dims(i, -1)\n",
" ret = [func(i) for i in args]\n",
" return ret\n",
"\n",
"def before_pad(seg_a, seg_b, label):\n",
" sentence, segments = build_2_pair(seg_a, seg_b)\n",
" return sentence, segments, label\n",
"\n",
"def after_pad(sentence, segments, label):\n",
" sentence, segments, label = expand_dims(sentence, segments, label)\n",
" return sentence, segments, label"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# a `propeller.paddle.data.Dataset` is built from FeatureColumns\n",
"\n",
"train_ds = feature_column.build_dataset('train', use_gz=False, data_dir='./xnli_data/train', shuffle=True, repeat=True) \\\n",
" .map(before_pad) \\\n",
" .padded_batch(BATCH, (0, 0, 0)) \\\n",
" .map(after_pad)\n",
"\n",
"dev_ds = feature_column.build_dataset('dev', use_gz=False, data_dir='./xnli_data/dev', shuffle=False, repeat=False) \\\n",
" .map(before_pad) \\\n",
" .padded_batch(BATCH, (0, 0, 0)) \\\n",
" .map(after_pad)\n",
"\n",
"shapes = ([-1, MAX_SEQLEN, 1], [-1, MAX_SEQLEN, 1], [-1, 1])\n",
"types = ('int64', 'int64', 'int64')\n",
"train_ds.data_shapes = shapes\n",
"train_ds.data_types = types\n",
"dev_ds.data_shapes = shapes\n",
"dev_ds.data_types = types\n",
"\n",
"warm_start_dir = MODEL + '/params'\n",
"# only the encoder and embedding is loaded from pretrained model\n",
"varname_to_warmstart = re.compile('^encoder.*w_0$|^encoder.*b_0$|^.*embedding$|^.*bias$|^.*scale$')\n",
"ws = propeller.WarmStartSetting(\n",
" predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(warm_start_dir, v.name)),\n",
" from_dir=warm_start_dir\n",
" )\n",
"\n",
"# propeller will export model of highest performance, the criteria is up to you. \n",
"# here we pick the model with maximum evaluatoin accuracy.\n",
"#`BestInferenceModelExporter` is used to export serveable models\n",
"best_inference_exporter = propeller.train.exporter.BestInferenceModelExporter(\n",
" os.path.join(OUTPUT_DIR, 'best'), \n",
" cmp_fn=lambda old, new: new['eval']['acc'] > old['eval']['acc'])\n",
"#`BestExporter` is used to export restartable checkpoint, so that we can restore from it and check test-set accuracy.\n",
"best_exporter = propeller.train.exporter.BestExporter(\n",
" os.path.join(OUTPUT_DIR, 'best_model'), \n",
" cmp_fn=lambda old, new: new['eval']['acc'] > old['eval']['acc'])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#ERNIE1.0 config \n",
"ernie_config = propeller.HParams(**json.loads(open(MODEL + '/ernie_config.json').read()))\n",
"\n",
"# default term in official config\n",
"ernie_v2_config = propeller.HParams(**{\n",
" \"sent_type_vocab_size\": None, \n",
" \"use_task_id\": False,\n",
" \"task_id\": 0,\n",
"})\n",
"\n",
"# train schema\n",
"train_config = propeller.HParams(**{ \n",
" \"warmup_proportion\": 0.1,\n",
" \"weight_decay\": 0.01,\n",
" \"use_fp16\": 0,\n",
" \"learning_rate\": 0.00005,\n",
" \"num_label\": 3,\n",
" \"batch_size\": 32\n",
"})\n",
"\n",
"config = ernie_config.join(ernie_v2_config).join(train_config)\n",
"\n",
"run_config = propeller.RunConfig(\n",
" model_dir=OUTPUT_DIR,\n",
" max_steps=EPOCH * train_data_size / BATCH,\n",
" skip_steps=10,\n",
" eval_steps=1000,\n",
" save_steps=1000,\n",
" log_steps=10,\n",
" max_ckpt=3\n",
")\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Finetune and Eval"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# `train_and_eval` takes key-word args only\n",
"# we are now ready to train\n",
"hooks = [propeller.train.TqdmNotebookProgressBarHook(run_config.max_steps)] # to show the progress bar, you need to `pip install tqdm ipywidgets`\n",
"propeller.train_and_eval(\n",
" model_class_or_model_fn=ClassificationErnieModel, #**careful**, you should pass a Class to `train_and_eval`, propeller will try to instantiate it.\n",
" params=config, \n",
" run_config=run_config, \n",
" train_dataset=train_ds, \n",
" eval_dataset=dev_ds, \n",
" warm_start_setting=ws, \n",
" exporters=[best_exporter, best_inference_exporter],\n",
" train_hooks=hooks,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Predict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# after training you might want to check your model performance on test-set\n",
"# let's do this via `propeller.predict`\n",
"# keep in mind that model of best performace has been exported during thet `train_and_eval` phrase\n",
"\n",
"best_filename = [file for file in os.listdir(os.path.join(OUTPUT_DIR, 'best_model')) if 'model' in file][0]\n",
"best_model_path = os.path.join(os.path.join(OUTPUT_DIR, 'best_model'), best_filename)\n",
"true_label = [label_map[(line.strip().split(b'\\t')[-1])]for line in open('./xnli_data/test/part.0', 'rb')]\n",
"\n",
"def drop_label(sentence, segments, label): #we drop the label column here\n",
" return sentence, segments\n",
"\n",
"test_ds = feature_column.build_dataset('test', use_gz=False, data_dir='./xnli_data/test', shuffle=False, repeat=False) \\\n",
" .map(before_pad) \\\n",
" .padded_batch(BATCH, (0, 0, 0)) \\\n",
" .map(after_pad) \\\n",
" .map(drop_label)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result = []\n",
"learner = propeller.Learner(ClassificationErnieModel, run_config, params=config, )\n",
"for pred in learner.predict(test_ds, ckpt=-1):\n",
" result.append(np.argmax(pred))\n",
" \n",
"result, true_label = np.array(result), np.array(true_label)\n",
"\n",
"test_acc = (result == true_label).sum() / len(true_label)\n",
"print('test accuracy:%.5f' % test_acc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Serving\n",
"your model is now ready to serve! \n",
"you can open up a server by propeller with \n",
"```script\n",
"python -m propeller.tools.start_server -m /path/to/saved/model -p 8888\n",
"```\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
# ERNIE fast inference (C++)
The ERNIE C++ fast inference API provides a more efficient online inference solution that can be compiled and linked directly into a production environment for better performance.
It is built on top of [fluid inference](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_usage/deploy/inference/native_infer.html).
**Please make sure your fluid inference version is higher than 1.6.3, otherwise the inference results may be incorrect.**
This page provides a demo benchmark for ERNIE C++ fast inference.
## Preparation
The demo data is taken from the test split of the XNLI dataset and is located in ./data. It uses a plain-text id format; each line represents one batch and contains four fields:
```text
src_ids, pos_ids, sent_ids, self_attn_mask
```
Fields are separated by semicolons (;). Each field consists of a `shape` part and a `data` part, separated by a colon (:); values within `shape` and `data` are separated by spaces. `self_attn_mask` is of type FLOAT32 and the remaining fields are INT64.
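For illustration only (hypothetical ids; the exact shapes depend on the exported model), a line for a batch with batch_size=1 and seq_len=3 could look like:
```text
1 3 1:101 231 102;1 3 1:0 1 2;1 3 1:0 0 0;1 3 3:1 1 1 1 1 1 1 1 1
```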
ERNIE fast inference takes a model in inference\_model format as input; see [here](../README.zh.md#生成inference_model) for how to generate an inference\_model.
**An inference\_model produced by propeller only needs the `src_ids` and `sent_ids` fields, so the data file has to be adjusted accordingly.**
## Build and Run
To build this demo, the C++ compiler needs to support the C++11 standard.
Download the matching [fluid_inference library](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/advanced_usage/deploy/inference/build_and_install_lib_cn.html); pick the version that matches your paddle build and configuration (whether avx and mkl are used, and the cuda/cudnn versions) and extract it. This yields a `fluid_inference` directory; place it in the same directory as `inference.cc`.
Build with:
``` bash
cd ./gpu # cd ./cpu
mkdir build
cd build
cmake ..
make
```
Run with:
```
./run.sh ../data/sample /path/to/inference_mode_dir
```
## Benchmark
Test setting: XNLI test set, BatchSize=1, SequenceLength=128.
Each measurement is repeated 5 times and the average is reported.
| Environment | Latency (ms) |
| ----- | ----- |
| CPU (Intel(R) Xeon(R) Gold 5117 CPU @ 2.00GHz, 20 threads) | 29.8818 |
| GPU (P4) | 8.5 |
CMAKE_MINIMUM_REQUIRED(VERSION 3.2)
PROJECT(inference_demo)
SET(CMAKE_C_COMPILER gcc)
SET(CMAKE_CXX_COMPILER g++)
ADD_COMPILE_OPTIONS(-std=c++11 -g)
SET(FLUID_INFER_LIB fluid_inference)
SET(FLUID_INC_PATH ${FLUID_INFER_LIB}/paddle/include)
SET(FLUID_LIB_PATH ${FLUID_INFER_LIB}/paddle/lib)
SET(GLOG_INC_PATH ${FLUID_INFER_LIB}/third_party/install/glog/include)
SET(GLOG_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/glog/lib)
SET(GFLAGS_INC_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/include)
SET(GFLAGS_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/lib)
SET(MKLDNN_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mkldnn/lib)
SET(MKLML_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mklml/lib)
INCLUDE_DIRECTORIES(${FLUID_INC_PATH})
INCLUDE_DIRECTORIES(${GLOG_INC_PATH})
INCLUDE_DIRECTORIES(${GFLAGS_INC_PATH})
LINK_DIRECTORIES(${FLUID_LIB_PATH})
LINK_DIRECTORIES(${GLOG_LIB_PATH})
LINK_DIRECTORIES(${GFLAGS_LIB_PATH})
LINK_DIRECTORIES(${MKLML_LIB_PATH})
LINK_DIRECTORIES(${MKLDNN_LIB_PATH})
ADD_EXECUTABLE(inference inference.cc)
TARGET_LINK_LIBRARIES(inference dl paddle_fluid glog gflags pthread)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <paddle_inference_api.h>
#include <chrono>
#include <fstream>
#include <iostream>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>
DEFINE_string(model_dir, "", "model directory");
DEFINE_string(data, "", "input data path");
DEFINE_int32(repeat, 1, "repeat");
DEFINE_bool(output_prediction, false, "Whether to output the prediction results.");
DEFINE_bool(use_gpu, false, "Whether to use GPU for prediction.");
DEFINE_int32(device, 0, "device.");
template <typename T>
void GetValueFromStream(std::stringstream *ss, T *t) {
(*ss) >> (*t);
}
template <>
void GetValueFromStream<std::string>(std::stringstream *ss, std::string *t) {
*t = ss->str();
}
// Split string to vector
template <typename T>
void Split(const std::string &line, char sep, std::vector<T> *v) {
std::stringstream ss;
T t;
for (auto c : line) {
if (c != sep) {
ss << c;
} else {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
if (!ss.str().empty()) {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();
template <>
constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
return paddle::PaddleDType::INT64;
}
template <>
constexpr paddle::PaddleDType GetPaddleDType<float>() {
return paddle::PaddleDType::FLOAT32;
}
// Parse tensor from string
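// For example (hypothetical values), the field "1 3:101 231 102" parses to
// shape {1, 3} and data {101, 231, 102}.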
template <typename T>
bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
std::vector<std::string> data;
Split(field, ':', &data);
if (data.size() < 2) return false;
std::string shape_str = data[0];
std::vector<int> shape;
Split(shape_str, ' ', &shape);
std::string mat_str = data[1];
std::vector<T> mat;
Split(mat_str, ' ', &mat);
tensor->shape = shape;
auto size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
sizeof(T);
tensor->data.Resize(size);
std::copy(mat.begin(), mat.end(), static_cast<T *>(tensor->data.data()));
tensor->dtype = GetPaddleDType<T>();
return true;
}
// Parse input tensors from string
bool ParseLine(const std::string &line,
std::vector<paddle::PaddleTensor> *tensors) {
std::vector<std::string> fields;
Split(line, ';', &fields);
  if (fields.size() < 4) return false;  // need all 4 fields: src_ids, sent_ids, pos_ids, input_mask
tensors->clear();
tensors->reserve(4);
int i = 0;
// src_ids
paddle::PaddleTensor src_ids;
ParseTensor<int64_t>(fields[i++], &src_ids);
src_ids.name = "eval_placeholder_0";
tensors->push_back(src_ids);
// sent_ids
paddle::PaddleTensor sent_ids;
ParseTensor<int64_t>(fields[i++], &sent_ids);
sent_ids.name = "eval_placeholder_1";
tensors->push_back(sent_ids);
// pos_ids
paddle::PaddleTensor pos_ids;
ParseTensor<int64_t>(fields[i++], &pos_ids);
pos_ids.name = "eval_placeholder_2";
tensors->push_back(pos_ids);
// input_mask
paddle::PaddleTensor input_mask;
ParseTensor<float>(fields[i++], &input_mask);
input_mask.name = "eval_placeholder_3";
tensors->push_back(input_mask);
return true;
}
// Print outputs to log
void PrintOutputs(const std::vector<paddle::PaddleTensor> &outputs) {
//LOG(INFO) << "example_id\tcontradiction\tentailment\tneutral";
for (size_t i = 0; i < outputs.front().data.length() / sizeof(float) / 3; i += 1) {
std::cout << static_cast<float *>(outputs[0].data.data())[3 * i] << "\t"
<< static_cast<float *>(outputs[0].data.data())[3 * i + 1] << "\t"
<< static_cast<float *>(outputs[0].data.data())[3 * i + 2] << std::endl;
}
}
bool LoadInputData(std::vector<std::vector<paddle::PaddleTensor>> *inputs) {
if (FLAGS_data.empty()) {
LOG(ERROR) << "please set input data path";
return false;
}
std::ifstream fin(FLAGS_data);
std::string line;
int lineno = 0;
  while (std::getline(fin, line)) {
    ++lineno;
std::vector<paddle::PaddleTensor> feed_data;
if (!ParseLine(line, &feed_data)) {
LOG(ERROR) << "Parse line[" << lineno << "] error!";
} else {
inputs->push_back(std::move(feed_data));
}
}
return true;
}
// ernie inference demo
// Options:
// --model_dir: ernie model file directory
// --data: data path
// --repeat: repeat num
// --use_gpu: use gpu
int main(int argc, char *argv[]) {
google::InitGoogleLogging(*argv);
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir.empty()) {
LOG(ERROR) << "please set model dir";
return -1;
}
paddle::AnalysisConfig config;
config.SetModel(FLAGS_model_dir);
config.DisableGpu();
config.SwitchIrOptim();
config.EnableMKLDNN();
config.SetCpuMathLibraryNumThreads(20);
//config.EnableMemoryOptim();
auto predictor = CreatePaddlePredictor(config);
std::vector<std::vector<paddle::PaddleTensor>> inputs;
if (!LoadInputData(&inputs)) {
LOG(ERROR) << "load input data error!";
return -1;
}
std::vector<paddle::PaddleTensor> fetch;
int total_time{0};
// auto predict_timer = []()
int num_samples{0};
int count{0};
for (int i = 0; i < FLAGS_repeat; i++) {
for (auto feed : inputs) {
fetch.clear();
auto start = std::chrono::system_clock::now();
predictor->Run(feed, &fetch);
if (FLAGS_output_prediction && i == 0) {
PrintOutputs(fetch);
}
auto end = std::chrono::system_clock::now();
count += 1;
if (!fetch.empty()) {
total_time +=
std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
.count();
//num_samples += fetch.front().data.length() / 2 / sizeof(float);
num_samples += fetch.front().data.length() / (sizeof(float) * 2);
}
}
}
auto per_sample_ms =
static_cast<float>(total_time) / num_samples;
LOG(INFO) << "Run " << num_samples
<< " samples, average latency: " << per_sample_ms
<< "ms per sample.";
LOG(INFO) << count;
return 0;
}
set -x
(($# != 2)) && echo "${0} data model" && exit -1
export LD_LIBRARY_PATH=fluid_inference/third_party/install/mkldnn/lib:fluid_inference/third_party/install/mklml/lib:fluid_inference/paddle/lib/:/home/work/cuda-9.0/lib64/:/home/work/cudnn/cudnn_v7_3_1_cuda9.0/lib64/:$LD_LIBRARY_PATH \
./build/inference --logtostderr \
--model_dir $2 \
--data $1 \
--repeat 5 \
--output_prediction true \
--use_gpu true \
--device 0
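For reference, the `--data` file consumed by `ParseLine`/`ParseTensor` above is plain text with one sample per line: four `;`-separated tensor fields (src_ids, sent_ids, pos_ids, input_mask), each field being `<space-separated shape>:<space-separated values>`. A hypothetical Python sketch that writes one such line (the token ids and the `[1, seq_len, 1]` shapes are assumptions and must match the exported model's placeholders):

```python
seq_len = 5
src_ids = [1, 1001, 1002, 1003, 2]  # hypothetical token ids
sent_ids = [0] * seq_len
pos_ids = list(range(seq_len))
input_mask = [1.0] * seq_len        # float tensor, unlike the int64 ids

def field(shape, values):
    # "<space-separated shape>:<space-separated values>"
    return '%s:%s' % (' '.join(map(str, shape)),
                      ' '.join(map(str, values)))

line = ';'.join(
    field([1, seq_len, 1], v)
    for v in (src_ids, sent_ids, pos_ids, input_mask))
with open('ernie_demo_input.txt', 'w') as f:  # pass this path via --data
    f.write(line + '\n')
```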
CMAKE_MINIMUM_REQUIRED(VERSION 3.2)
PROJECT(inference_demo)
SET(CMAKE_C_COMPILER gcc)
SET(CMAKE_CXX_COMPILER g++)
ADD_COMPILE_OPTIONS(-std=c++11 -g)
SET(FLUID_INFER_LIB fluid_inference)
SET(FLUID_INC_PATH ${FLUID_INFER_LIB}/paddle/include)
SET(FLUID_LIB_PATH ${FLUID_INFER_LIB}/paddle/lib)
SET(GLOG_INC_PATH ${FLUID_INFER_LIB}/third_party/install/glog/include)
SET(GLOG_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/glog/lib)
SET(GFLAGS_INC_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/include)
SET(GFLAGS_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/gflags/lib)
SET(MKLDNN_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mkldnn/lib)
SET(MKLML_LIB_PATH ${FLUID_INFER_LIB}/third_party/install/mklml/lib)
INCLUDE_DIRECTORIES(${FLUID_INC_PATH})
INCLUDE_DIRECTORIES(${GLOG_INC_PATH})
INCLUDE_DIRECTORIES(${GFLAGS_INC_PATH})
LINK_DIRECTORIES(${FLUID_LIB_PATH})
LINK_DIRECTORIES(${GLOG_LIB_PATH})
LINK_DIRECTORIES(${GFLAGS_LIB_PATH})
LINK_DIRECTORIES(${MKLML_LIB_PATH})
LINK_DIRECTORIES(${MKLDNN_LIB_PATH})
ADD_EXECUTABLE(inference inference.cc)
TARGET_LINK_LIBRARIES(inference dl paddle_fluid glog gflags pthread)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <paddle_inference_api.h>
#include <chrono>
#include <fstream>
#include <iostream>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>
DEFINE_string(model_dir, "", "model directory");
DEFINE_string(data, "", "input data path");
DEFINE_int32(repeat, 1, "repeat");
DEFINE_bool(output_prediction, false, "Whether to output the prediction results.");
DEFINE_bool(use_gpu, false, "Whether to use GPU for prediction.");
DEFINE_int32(device, 0, "device.");
template <typename T>
void GetValueFromStream(std::stringstream *ss, T *t) {
(*ss) >> (*t);
}
template <>
void GetValueFromStream<std::string>(std::stringstream *ss, std::string *t) {
*t = ss->str();
}
// Split string to vector
template <typename T>
void Split(const std::string &line, char sep, std::vector<T> *v) {
std::stringstream ss;
T t;
for (auto c : line) {
if (c != sep) {
ss << c;
} else {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
if (!ss.str().empty()) {
GetValueFromStream<T>(&ss, &t);
v->push_back(std::move(t));
ss.str({});
ss.clear();
}
}
template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();
template <>
constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
return paddle::PaddleDType::INT64;
}
template <>
constexpr paddle::PaddleDType GetPaddleDType<float>() {
return paddle::PaddleDType::FLOAT32;
}
// Parse tensor from string
template <typename T>
bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
std::vector<std::string> data;
Split(field, ':', &data);
if (data.size() < 2) return false;
std::string shape_str = data[0];
std::vector<int> shape;
Split(shape_str, ' ', &shape);
std::string mat_str = data[1];
std::vector<T> mat;
Split(mat_str, ' ', &mat);
tensor->shape = shape;
auto size =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>()) *
sizeof(T);
tensor->data.Resize(size);
std::copy(mat.begin(), mat.end(), static_cast<T *>(tensor->data.data()));
tensor->dtype = GetPaddleDType<T>();
return true;
}
// Parse input tensors from string
bool ParseLine(const std::string &line,
std::vector<paddle::PaddleTensor> *tensors) {
std::vector<std::string> fields;
Split(line, ';', &fields);
  if (fields.size() < 4) return false;
tensors->clear();
tensors->reserve(4);
int i = 0;
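  // src_ids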
paddle::PaddleTensor src_ids;
ParseTensor<int64_t>(fields[i++], &src_ids);
src_ids.name = "eval_placeholder_0";
tensors->push_back(src_ids);
// sent_ids
paddle::PaddleTensor sent_ids;
ParseTensor<int64_t>(fields[i++], &sent_ids);
sent_ids.name = "eval_placeholder_1";
tensors->push_back(sent_ids);
// pos_ids
paddle::PaddleTensor pos_ids;
ParseTensor<int64_t>(fields[i++], &pos_ids);
pos_ids.name = "eval_placeholder_2";
tensors->push_back(pos_ids);
// input_mask
paddle::PaddleTensor input_mask;
ParseTensor<float>(fields[i++], &input_mask);
input_mask.name = "eval_placeholder_3";
tensors->push_back(input_mask);
return true;
}
// Print outputs to log
void PrintOutputs(const std::vector<paddle::PaddleTensor> &outputs) {
//LOG(INFO) << "example_id\tcontradiction\tentailment\tneutral";
for (size_t i = 0; i < outputs.front().data.length() / sizeof(float) / 3; i += 1) {
std::cout << static_cast<float *>(outputs[0].data.data())[3 * i] << "\t"
<< static_cast<float *>(outputs[0].data.data())[3 * i + 1] << "\t"
<< static_cast<float *>(outputs[0].data.data())[3 * i + 2] << std::endl;
}
}
bool LoadInputData(std::vector<std::vector<paddle::PaddleTensor>> *inputs) {
if (FLAGS_data.empty()) {
LOG(ERROR) << "please set input data path";
return false;
}
std::ifstream fin(FLAGS_data);
std::string line;
int lineno = 0;
  while (std::getline(fin, line)) {
    ++lineno;
std::vector<paddle::PaddleTensor> feed_data;
if (!ParseLine(line, &feed_data)) {
LOG(ERROR) << "Parse line[" << lineno << "] error!";
} else {
inputs->push_back(std::move(feed_data));
}
}
return true;
}
// ernie inference demo
// Options:
// --model_dir: ernie model file directory
// --data: data path
// --repeat: repeat num
// --use_gpu: use gpu
int main(int argc, char *argv[]) {
google::InitGoogleLogging(*argv);
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir.empty()) {
LOG(ERROR) << "please set model dir";
return -1;
}
paddle::AnalysisConfig config;
config.SetModel(FLAGS_model_dir);
config.EnableUseGpu(100, 0);
config.SwitchSpecifyInputNames(true);
config.EnableCUDNN();
config.SwitchIrOptim(true);
config.EnableMemoryOptim();
auto predictor = CreatePaddlePredictor(config);
std::vector<std::vector<paddle::PaddleTensor>> inputs;
if (!LoadInputData(&inputs)) {
LOG(ERROR) << "load input data error!";
return -1;
}
std::vector<paddle::PaddleTensor> fetch;
int total_time{0};
// auto predict_timer = []()
int num_samples{0};
int count{0};
for (int i = 0; i < FLAGS_repeat; i++) {
for (auto feed : inputs) {
fetch.clear();
auto start = std::chrono::system_clock::now();
predictor->Run(feed, &fetch);
if (FLAGS_output_prediction && i == 0) {
PrintOutputs(fetch);
}
auto end = std::chrono::system_clock::now();
count += 1;
if (!fetch.empty()) {
total_time +=
std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
.count();
//num_samples += fetch.front().data.length() / 2 / sizeof(float);
num_samples += fetch.front().data.length() / (sizeof(float) * 2);
}
}
}
auto per_sample_ms =
static_cast<float>(total_time) / num_samples;
LOG(INFO) << "Run " << num_samples
<< " samples, average latency: " << per_sample_ms
<< "ms per sample.";
LOG(INFO) << count;
return 0;
}
set -x
(($# != 2)) && echo "${0} data model" && exit -1
export LD_LIBRARY_PATH=fluid_inference/third_party/install/mkldnn/lib:fluid_inference/third_party/install/mklml/lib:fluid_inference/paddle/lib/:/home/work/cuda-9.0/lib64/:/home/work/cudnn/cudnn_v7_3_1_cuda9.0/lib64/:$LD_LIBRARY_PATH
./build/inference --logtostderr \
--model_dir $2 \
--data $1 \
--repeat 5 \
--output_prediction true \
--use_gpu true \
--device 0
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Load classifier's checkpoint to do prediction or save inference model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import argparse
import numpy as np
import multiprocessing
# NOTE(paddle-dev): All of these flags should be
# set before `import paddle`. Otherwise, it would
# not take any effect.
os.environ['FLAGS_eager_delete_tensor_gb'] = '0' # enable gc
import paddle.fluid as fluid
from reader.task_reader import ClassifyReader
from model.ernie import ErnieConfig
from finetune.classifier import create_model
from utils.args import ArgumentGroup, print_arguments
from utils.init import init_pretraining_params
from finetune_args import parser
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
model_g = ArgumentGroup(parser, "model", "options to init, resume and save model.")
model_g.add_arg("ernie_config_path", str, None, "Path to the json file for bert model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("save_inference_model_path", str, "inference_model", "If set, save the inference model to this path.")
model_g.add_arg("use_fp16", bool, False, "Whether to resume parameters from fp16 checkpoint.")
model_g.add_arg("num_labels", int, 2, "num labels for classify")
model_g.add_arg("ernie_version", str, "1.0", "ernie_version")
data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options.")
data_g.add_arg("predict_set", str, None, "Predict set file")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("label_map_config", str, None, "Label_map_config json file.")
data_g.add_arg("max_seq_len", int, 128, "Number of words of the longest seqence.")
data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("do_lower_case", bool, True,
"Whether to lower case the input text. Should be True for uncased models and False for cased models.")
run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("do_prediction", bool, True, "Whether to do prediction on test set.")
args = parser.parse_args()
# yapf: enable.
def main(args):
ernie_config = ErnieConfig(args.ernie_config_path)
ernie_config.print_config()
reader = ClassifyReader(
vocab_path=args.vocab_path,
label_map_config=args.label_map_config,
max_seq_len=args.max_seq_len,
do_lower_case=args.do_lower_case,
in_tokens=False,
is_inference=True)
predict_prog = fluid.Program()
predict_startup = fluid.Program()
with fluid.program_guard(predict_prog, predict_startup):
with fluid.unique_name.guard():
predict_pyreader, probs, feed_target_names = create_model(
args,
pyreader_name='predict_reader',
ernie_config=ernie_config,
is_classify=True,
is_prediction=True,
ernie_version=args.ernie_version)
predict_prog = predict_prog.clone(for_test=True)
if args.use_cuda:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exe = fluid.Executor(place)
exe.run(predict_startup)
if args.init_checkpoint:
init_pretraining_params(exe, args.init_checkpoint, predict_prog)
else:
raise ValueError("args 'init_checkpoint' should be set for prediction!")
assert args.save_inference_model_path, "args save_inference_model_path should be set for prediction"
_, ckpt_dir = os.path.split(args.init_checkpoint.rstrip('/'))
dir_name = ckpt_dir + '_inference_model'
model_path = os.path.join(args.save_inference_model_path, dir_name)
print("save inference model to %s" % model_path)
fluid.io.save_inference_model(
model_path,
feed_target_names, [probs],
exe,
main_program=predict_prog)
print("load inference model from %s" % model_path)
infer_program, feed_target_names, probs = fluid.io.load_inference_model(
model_path, exe)
src_ids = feed_target_names[0]
sent_ids = feed_target_names[1]
pos_ids = feed_target_names[2]
input_mask = feed_target_names[3]
if args.ernie_version == "2.0":
task_ids = feed_target_names[4]
predict_data_generator = reader.data_generator(
input_file=args.predict_set,
batch_size=args.batch_size,
epoch=1,
shuffle=False)
print("-------------- prediction results --------------")
np.set_printoptions(precision=4, suppress=True)
index = 0
for sample in predict_data_generator():
src_ids_data = sample[0]
sent_ids_data = sample[1]
pos_ids_data = sample[2]
task_ids_data = sample[3]
input_mask_data = sample[4]
if args.ernie_version == "1.0":
output = exe.run(
infer_program,
feed={src_ids: src_ids_data,
sent_ids: sent_ids_data,
pos_ids: pos_ids_data,
input_mask: input_mask_data},
fetch_list=probs)
elif args.ernie_version == "2.0":
output = exe.run(
infer_program,
feed={src_ids: src_ids_data,
sent_ids: sent_ids_data,
pos_ids: pos_ids_data,
task_ids: task_ids_data,
input_mask: input_mask_data},
fetch_list=probs)
else:
raise ValueError("ernie_version must be 1.0 or 2.0")
for single_result in output[0]:
print("example_index:{}\t{}".format(index, single_result))
index += 1
if __name__ == '__main__':
print_arguments(args)
main(args)
[简体中文](./README.md)|English
# Introducing Propeller
This doc introduces Propeller, a high-level Paddle API for general ML. Propeller encapsulates the following actions:
- training
- evaluation
- prediction
- export serving
Propeller provides the following benefits:
- You can run Propeller-based models on a local host or in a distributed multi-server environment without changing your model. Furthermore, you can run Propeller-based models on CPUs or GPUs without recoding your model.
- Propeller simplifies sharing implementations between model developers.
- Propeller does many things for you (logging, hot-start...)
- Propeller builds the Program and PyReader for you.
- Propeller provides a safe distributed training loop that controls how and when to:
- build the Program
- initialize variables
- create checkpoint files and recover from failures
- save visualizable results
## Getting Started
```python
#Define model
class BowModel(propeller.Model):
def __init__(self, config, mode):
self.embedding = Embedding(config['emb_size'], config['vocab_size'])
self.fc1 = FC(config['hidden_size'])
self.fc2 = FC(config['hidden_size'])
def forward(self, features):
q, t = features
q_emb = softsign(self.embedding(q))
t_emb = softsign(self.embedding(t))
q_emb = self.fc1(q_emb)
        t_emb = self.fc2(t_emb)
        prediction = dot(q_emb, t_emb)
return prediction
def loss(self, predictions, label):
return sigmoid_cross_entropy_with_logits(predictions, label)
def backward(self, loss):
opt = AdamOptimizer(1.e-3)
        opt.minimize(loss)
def metrics(self, predictions, label):
        auc = propeller.metrics.Auc(predictions, label)
return {'auc': auc}
# hyper-parameters can come from config files, the command line, or environment variables
run_config = propeller.parse_runconfig(args)
hparams = propeller.parse_hparam(args)
# Define data
# `FeatureColumns` helps you to organize training/evaluation files.
feature_column = propeller.data.FeatureColumns(columns=[
propeller.data.TextColumn('query', vocab='./vocab'),
propeller.data.TextColumn('title', vocab='./vocab'),
propeller.data.LabelColumn('label'),
])
train_ds = feature_column.build_dataset(data_dir='./data', shuffle=True, repeat=True)
eval_ds = feature_column.build_dataset(data_dir='./data', shuffle=False, repeat=False)
# Start training!
propeller.train_and_eval(BowModel, hparams, run_config, train_ds, eval_ds)
```
For more details, see example/toy/.
## Main Features
1. train_and_eval
   According to a user-specified `propeller.Model` class, initializes the training model in two modes (1. TRAIN mode, 2. EVAL mode) and performs training and evaluation.
2. FeatureColumns
   `FeatureColumns` is used to organize training data. With customizable `Column` properties, it adapts to many ML tasks (NLP/CV...).
   `FeatureColumns` also does the preprocessing for you (tokenization, vocab lookup, serialization, batching, etc.).
3. Dataset
   `FeatureColumns` generates a `Dataset`, or you can call `propeller.Dataset.from_generator_func` to build your own `Dataset` (see the sketch after this list).
4. Summary
   To trace a tensor histogram during training, simply:
```python
propeller.summary.histogram('loss', tensor)
```
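Tying these pieces together, a minimal runnable `Dataset` sketch (the data is hypothetical, and it assumes `propeller` is on your `PYTHONPATH`):

```python
import numpy as np
from propeller.paddle.data.functional import Dataset

def gen():
    # each element is a tuple of numpy arrays (variable-length sequences)
    for n in range(1, 5):
        yield np.arange(n, dtype=np.int64),

ds = (Dataset.from_generator_func(gen)
      .shuffle(buffer_size=4)
      .padded_batch(batch_size=2, pad_value=0))
for batch in ds:      # a Dataset is directly iterable
    print(batch[0])   # a (2, max_len) int64 array, zero-padded
```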
## Contributing
1. This project is in the alpha stage; any contribution is welcome. Feel free to create a PR.
简体中文|[English](./README.en.md)
# Introducing paddle-propeller
This document introduces Propeller, a high-level Paddle API that greatly simplifies machine-learning programming. Propeller encapsulates the following actions:
- training
- evaluation
- prediction
- export for serving (deployment)
Propeller offers the following advantages:
- You can run Propeller-based models on a local host or in a distributed multi-server environment without changing your model. Furthermore, you can run Propeller-based models on CPUs or GPUs without recoding your model.
- Propeller simplifies sharing implementations between model developers.
- Focus only on the model implementation and data input; no need to write auxiliary code (saving, hot-start, logging, etc.).
- Propeller builds the Program and PyReader for you.
- Propeller provides a safe distributed training loop that controls how and when to:
    - build the Program
    - initialize variables
    - handle exceptions
    - create checkpoint files and recover from failures
    - save visualizable summary results
## Getting Started
```python
# Define the training model
class BowModel(propeller.Model):
def __init__(self, config, mode):
self.embedding = Embedding(config['emb_size'], config['vocab_size'])
self.fc1 = FC(config['hidden_size'])
        self.fc2 = FC(config['hidden_size'])
def forward(self, features):
q, t = features
q_emb = softsign(self.embedding(q))
t_emb = softsign(self.embedding(t))
q_emb = self.fc1(q_emb)
        t_emb = self.fc2(t_emb)
        prediction = dot(q_emb, t_emb)
return prediction
def loss(self, predictions, label):
return sigmoid_cross_entropy_with_logits(predictions, label)
def backward(self, loss):
opt = AdamOptimizer(1.e-3)
        opt.minimize(loss)
def metrics(self, predictions, label):
        auc = propeller.metrics.Auc(predictions, label)
return {'auc': auc}
# hyper-parameters can come from config files, environment variables, or the command line
run_config = propeller.parse_runconfig(args)
hparams = propeller.parse_hparam(args)
# Define the data:
# `FeatureColumns` manages training/prediction files and serializes them to binary automatically.
feature_column = propeller.data.FeatureColumns(columns=[
propeller.data.TextColumn('query', vocab='./vocab'),
propeller.data.TextColumn('title', vocab='./vocab'),
propeller.data.LabelColumn('label'),
])
train_ds = feature_column.build_dataset(data_dir='./data', shuffle=True, repeat=True)
eval_ds = feature_column.build_dataset(data_dir='./data', shuffle=False, repeat=False)
# Start training!
propeller.train_and_eval(BowModel, hparams, run_config, train_ds, eval_ds)
```
For more details, see example/toy/.
## Main Components
1. train_and_eval
   Given a user-provided `propeller.Model` class, instantiates the training model in two modes: 1. TRAIN mode 2. EVAL mode,
   then starts training while performing evaluation.
2. FeatureColumns
   `FeatureColumns` manages training data. Custom `Column`s adapt it to many ML tasks (NLP/CV...).
   `FeatureColumns` automatically batch-preprocesses the provided training data (tokenization, vocab lookup, etc.), serializes it to binary, and generates the dataset used for training.
3. Dataset
   `FeatureColumns` generates a `Dataset`, or you can call `propeller.Dataset.from_generator_func` to build your own `Dataset`, combining methods such as shuffle / interleave / padded_batch / repeat to meet custom needs.
4. Summary
   To log and trace certain parameters during training, simply:
```python
propeller.summary.histogram('loss', tensor)
```
## Contributing
1. This project is at an early stage; contributions are welcome!
2. Functional programming contributions are welcome
## TODO
1. Automatic inference of dataset output_types / output_shapes
2. Automatic hyper-parameter search
3. propeller server
4. ...
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Propeller"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import os
import sys
import logging
import six
from time import time
__version__ = '0.1'
log = logging.getLogger(__name__)
stream_hdl = logging.StreamHandler(stream=sys.stderr)
formatter = logging.Formatter(
fmt='[%(levelname)s] %(asctime)s [%(filename)12s:%(lineno)5d]:\t%(message)s'
)
try:
from colorlog import ColoredFormatter
fancy_formatter = ColoredFormatter(
fmt='%(log_color)s[%(levelname)s] %(asctime)s [%(filename)12s:%(lineno)5d]:\t%(message)s'
)
stream_hdl.setFormatter(fancy_formatter)
except ImportError:
stream_hdl.setFormatter(formatter)
log.setLevel(logging.INFO)
log.addHandler(stream_hdl)
log.propagate = False
from propeller.types import *
from propeller.util import ArgumentParser, parse_hparam, parse_runconfig, parse_file
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
doc
"""
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Basic Dataset API"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import sys
import logging
import os
import itertools
import random
import inspect
import multiprocessing
from contextlib import contextmanager
import gzip
import struct
import functools
import six
from six.moves import zip, map, filter
import numpy as np
from propeller.util import map_structure
log = logging.getLogger(__name__)
__all__ = ['Dataset']
@contextmanager
def _open_file(filename, format=None):
if format is None:
fd = open(filename, 'rb')
elif format == 'GZIP':
fd = gzip.open(filename, 'rb')
else:
        raise ValueError('unknown file format %s' % format)
yield fd
fd.close()
def _open_record(filename):
def _gen():
with _open_file(filename, format='GZIP') as f:
while True:
data = f.read(struct.calcsize('i'))
if not len(data):
                    return
l, = struct.unpack('i', data)
data = f.read(l)
yield data
return _gen
def _shuffle_func(dataset, buffer_size):
def _gen():
buf = []
iterable = dataset()
try:
while len(buf) < buffer_size:
buf.append(next(iterable))
while 1:
i = random.randint(0, buffer_size - 1)
n = next(iterable)
yield buf[i]
buf[i] = n
except StopIteration:
if len(buf):
random.shuffle(buf)
for i in buf:
yield i
return _gen
def _interleave_func(iterable, map_fn, cycle_length, block_length):
def _gen():
ls = itertools.tee(iterable(), cycle_length)
buf = []
for i, j in enumerate(ls):
j = itertools.islice(j, i, None, cycle_length)
j = map(map_fn, j)
j = (jjj for jj in j for jjj in jj) #flatten
buf.append(j)
for tup in six.moves.zip_longest(*buf):
for ii in (i for i in tup if i is not None):
yield ii
return _gen
def _repeat_func(dataset, n):
def _gen():
iterable = dataset()
if n >= 0:
ret = itertools.chain(*itertools.tee(iterable, n))
else:
ret = itertools.cycle(iterable)
for i in ret:
yield i
return _gen
def _filter_func(dataset, fn):
def _gen():
for i in dataset():
if isinstance(i, tuple) or isinstance(i, list):
if fn(*i) is True:
yield i
else:
if fn(i) is True:
yield i
return _gen
def _map_func(dataset, fn):
def _gen():
for i in dataset():
if isinstance(i, tuple) or isinstance(i, list):
yield fn(*i)
else:
yield fn(i)
return _gen
def _shard_func(dataset, num_shards, index):
def _gen():
iterable = dataset()
ret = itertools.islice(iterable, index, None, num_shards)
for i in ret:
yield i
return _gen
def _take_func(dataset, count):
def _gen():
iterable = dataset()
ret = itertools.islice(iterable, count)
for i in ret:
yield i
return _gen
def _buffered_func(dataset, size):
"""
Creates a buffered data reader.
The buffered data reader will read and save data entries into a
buffer. Reading from the buffered data reader will proceed as long
as the buffer is not empty.
:param reader: the data reader to read from.
:type reader: callable
:param size: max buffer size.
:type size: int
:returns: the buffered data reader.
"""
class _EndSignal(object):
pass
end = _EndSignal()
def _read_worker(r, q):
for d in r:
q.put(d)
q.put(end)
def _data_reader():
r = dataset()
q = multiprocessing.Queue(maxsize=size)
t = multiprocessing.Process(
target=_read_worker, args=(
r,
q, ))
t.daemon = True
t.start()
e = q.get()
while e != end:
yield e
e = q.get()
return _data_reader
def _padded_batch_func(dataset, batch_size, pad_value=0, max_seqlen=None):
if not isinstance(batch_size, int):
raise ValueError('unknown batch_size: %s' % repr(batch_size))
def _gen():
iterable = dataset()
pad_value_t = pad_value
while True:
buf = list(itertools.islice(iterable, batch_size))
if not len(buf):
                return
buf = list(zip(*buf)) # transpose
if type(pad_value_t) not in [list, tuple]:
pad_value_t = [pad_value_t] * len(buf)
padded = []
assert len(buf) == len(
pad_value_t), 'pad_value [%d] != element size[%d]' % (
len(pad_value_t), len(buf))
for e, pv in zip(buf, pad_value_t):
elem = e[0]
if (not np.isscalar(elem)) and elem.shape != ():
max_len = max(map(len,
e)) if max_seqlen is None else max_seqlen
def _fn(i):
if max_len >= len(i):
return np.pad(i, [0, max_len - len(i)],
'constant',
constant_values=pv)
else:
return i[:max_len]
e = map(_fn, e)
padded.append(np.stack(list(e)))
yield padded
return _gen
class Dataset(object):
"""Python Wrapper for PyReader"""
@classmethod
def from_generator_func(cls, _gen, data_shapes=None, data_types=None):
"""doc"""
if not inspect.isgeneratorfunction(_gen):
raise ValueError('expect generator function, got %s' % repr(_gen))
def _wrapper(): #compat to py3.7
try:
for item in _gen():
yield item
except RuntimeError as e:
if str(e) != 'generator raised StopIteration':
raise e
ret = cls()
ret.generator = _wrapper
ret.data_shapes = data_shapes
ret.data_types = data_types
return ret
@classmethod
def from_file(cls, filename, format=None):
"""doc"""
if os.path.getsize(filename) == 0:
raise RuntimeError('%s is empty' % filename)
def _gen():
with _open_file(filename, format) as f:
for line in f:
yield line
ret = cls()
ret.generator = _gen
ret.data_shapes = []
ret.data_types = str
return ret
@classmethod
def from_record_file(cls, filename):
"""doc"""
if os.path.getsize(filename) == 0:
raise RuntimeError('%s is empty' % filename)
_gen = _open_record(filename)
ret = cls()
ret.generator = _gen
ret.data_shapes = []
ret.data_types = str
return ret
@classmethod
def from_list(cls, ls):
"""doc"""
if not isinstance(ls, list):
raise ValueError('expect list, got %s' % repr(ls))
def _gen():
for i in ls:
yield i
ret = cls()
ret.generator = _gen
ret.data_shapes = []
ret.data_types = str
return ret
def __init__(self):
self.name = None
self._data_shapes = None
self._data_types = None
self.generator = None
self.pyreader = None
def __repr__(self):
return 'Dataset: name: %s, data_shapes %s, data_types %s' % (
self.name, self._data_shapes, self._data_types)
def __eq__(self, other):
return self.name == other.name and \
self._data_shapes == other._data_shapes and \
self._data_types == other._data_types
def __iter__(self):
return self.generator()
#def __call__(self):
# return self.generator()
def _infer_shapes_and_types(self):
if self.generator is not None and self.name is not None:
log.info('Try to infer data shapes & types from generator')
first_value = next(self.generator())
shapes, types = [], []
for v in first_value:
if not isinstance(v, np.ndarray):
raise ValueError(
'dataset generator should use numpy elements, got %s' %
first_value)
shapes.append(v.shape)
types.append(v.dtype.name)
self._data_shapes = shapes
self._data_types = types
log.info('Dataset `%s` has data_shapes: %s data_types: %s' %
(self.name, repr(shapes), repr(types)))
else:
raise ValueError(
'Try to infer data shapes or types from incomplete Dataset')
@property
def data_shapes(self):
"""doc"""
        if self._data_shapes is None:
            self._infer_shapes_and_types()
        return self._data_shapes
@data_shapes.setter
def data_shapes(self, val):
"""doc"""
self._data_shapes = val
@property
def data_types(self):
"""doc"""
        if self._data_types is None:
            self._infer_shapes_and_types()
        return self._data_types
@data_types.setter
def data_types(self, val):
"""doc"""
self._data_types = val
def apply(self, transform_func):
"""apply transform func to datasets"""
#input_shapes = transform_func.input_shapes
#input_types = transform_func.input_types
#data_shapes = transform_func.data_shapes
#data_types = transform_func.data_types
#assert input_shapes == self._data_shapes
#assert input_types = self._data_types
ret_gen = transform_func(self.generator)
ret = type(self).from_generator_func(ret_gen)
if self.name is not None:
ret.name = self.name
#ret.data_shapes = data_shapes
#ret.data_types = data_types
return ret
def shuffle(self, buffer_size):
"""doc"""
func = functools.partial(_shuffle_func, buffer_size=buffer_size)
return self.apply(func)
def repeat(self, n=-1):
"""doc"""
func = functools.partial(_repeat_func, n=n)
return self.apply(func)
def map(self, fn):
"""doc"""
func = functools.partial(_map_func, fn=fn)
return self.apply(func)
def filter(self, fn):
"""doc"""
func = functools.partial(_filter_func, fn=fn)
return self.apply(func)
def shard(self, num_shards, index):
"""doc"""
func = functools.partial(
_shard_func, num_shards=num_shards, index=index)
return self.apply(func)
def interleave(self, map_fn, cycle_length, block_length):
"""doc"""
func = functools.partial(
_interleave_func,
map_fn=map_fn,
cycle_length=cycle_length,
block_length=block_length)
return self.apply(func)
def padded_batch(self, batch_size, pad_value=0, max_seqlen=None):
"""doc"""
func = functools.partial(
_padded_batch_func,
batch_size=batch_size,
pad_value=pad_value,
max_seqlen=max_seqlen)
return self.apply(func)
def take(self, count=1):
"""doc"""
func = functools.partial(_take_func, count=count)
return self.apply(func)
def buffered(self, size=10):
"""doc"""
func = functools.partial(_buffered_func, size=size)
return self.apply(func)
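For illustration, a small sketch (with hypothetical file names) of how `interleave` round-robins across per-file datasets; this mirrors what `FeatureColumns` does when reading a data directory:

```python
from propeller.data.functional import Dataset

# hypothetical: 'a.txt' and 'b.txt' are small existing text files
files = Dataset.from_list(['a.txt', 'b.txt'])
lines = files.interleave(
    map_fn=lambda filename: Dataset.from_file(filename),
    cycle_length=2,  # read both files concurrently
    block_length=1)  # one line from each file in turn
for line in lines:   # yields raw bytes lines, alternating between files
    print(line)
```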
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
doc
"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import six
from propeller.types import *
from propeller.util import ArgumentParser, parse_hparam, parse_runconfig, parse_file
from propeller.paddle import data
from propeller.paddle import train
from propeller.paddle.train import *
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""global collections"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import sys
_global_collection = None
class Key(object):
"""predefine collection keys"""
SUMMARY_SCALAR = 1
SUMMARY_HISTOGRAM = 2
SKIP_OPTIMIZE = 3
class Collections(object):
"""global collections to record everything"""
def __init__(self):
self.col = {}
def __enter__(self):
global _global_collection
_global_collection = self
return self
def __exit__(self, err_type, err_value, trace):
global _global_collection
_global_collection = None
def add(self, key, val):
"""doc"""
self.col.setdefault(key, []).append(val)
def get(self, key):
"""doc"""
return self.col.get(key, None)
def default_collection():
"""return global collection"""
global _global_collection
if _global_collection is None:
_global_collection = Collections()
return _global_collection
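A minimal usage sketch of the collection mechanism above (the key/value pairs are hypothetical):

```python
with Collections() as col:
    # components register values under predefined keys while the scope is open
    col.add(Key.SUMMARY_SCALAR, ('loss', 0.25))
    col.add(Key.SUMMARY_SCALAR, ('acc', 0.9))
    print(col.get(Key.SUMMARY_SCALAR))  # [('loss', 0.25), ('acc', 0.9)]
# outside the scope, default_collection() lazily creates a fresh global one
assert default_collection().get(Key.SUMMARY_SCALAR) is None
```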
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
doc
"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
from propeller.paddle.data.functional import *
from propeller.paddle.data.feature_column import *
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Protocol messages for describing input data Examples for machine learning
// model training or inference.
syntax = "proto3";
import "propeller/paddle/data/feature.proto";
package propeller;
message Example {
Features features = 1;
};
message SequenceExample {
Features context = 1;
FeatureLists feature_lists = 2;
};
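For concreteness, a sketch of building and serializing one `Example` from Python with the generated modules (the feature names and values are hypothetical; the constructors match those used in feature_column.py below):

```python
from propeller.paddle.data import example_pb2, feature_pb2

features = {
    'text': feature_pb2.Feature(
        int64_list=feature_pb2.Int64List(value=[1, 7, 42])),
    'label': feature_pb2.Feature(
        int64_list=feature_pb2.Int64List(value=[0])),
}
ex = example_pb2.Example(features=feature_pb2.Features(feature=features))
serialized = ex.SerializeToString()

parsed = example_pb2.Example()
parsed.ParseFromString(serialized)
print(parsed.features.feature['text'].int64_list.value)  # [1, 7, 42]
```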
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: propeller/paddle/data/example.proto
import sys
_b = sys.version_info[0] < 3 and (lambda x: x) or (
lambda x: x.encode('latin1'))
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
from propeller.paddle.data import feature_pb2 as propeller_dot_paddle_dot_data_dot_feature__pb2
DESCRIPTOR = _descriptor.FileDescriptor(
name='propeller/paddle/data/example.proto',
package='propeller',
syntax='proto3',
serialized_options=None,
serialized_pb=_b(
'\n#propeller/paddle/data/example.proto\x12\tpropeller\x1a#propeller/paddle/data/feature.proto\"0\n\x07\x45xample\x12%\n\x08\x66\x65\x61tures\x18\x01 \x01(\x0b\x32\x13.propeller.Features\"g\n\x0fSequenceExample\x12$\n\x07\x63ontext\x18\x01 \x01(\x0b\x32\x13.propeller.Features\x12.\n\rfeature_lists\x18\x02 \x01(\x0b\x32\x17.propeller.FeatureListsb\x06proto3'
),
dependencies=[
propeller_dot_paddle_dot_data_dot_feature__pb2.DESCRIPTOR,
])
_EXAMPLE = _descriptor.Descriptor(
name='Example',
full_name='propeller.Example',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='features',
full_name='propeller.Example.features',
index=0,
number=1,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[],
serialized_start=87,
serialized_end=135, )
_SEQUENCEEXAMPLE = _descriptor.Descriptor(
name='SequenceExample',
full_name='propeller.SequenceExample',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='context',
full_name='propeller.SequenceExample.context',
index=0,
number=1,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='feature_lists',
full_name='propeller.SequenceExample.feature_lists',
index=1,
number=2,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[],
serialized_start=137,
serialized_end=240, )
_EXAMPLE.fields_by_name[
'features'].message_type = propeller_dot_paddle_dot_data_dot_feature__pb2._FEATURES
_SEQUENCEEXAMPLE.fields_by_name[
'context'].message_type = propeller_dot_paddle_dot_data_dot_feature__pb2._FEATURES
_SEQUENCEEXAMPLE.fields_by_name[
'feature_lists'].message_type = propeller_dot_paddle_dot_data_dot_feature__pb2._FEATURELISTS
DESCRIPTOR.message_types_by_name['Example'] = _EXAMPLE
DESCRIPTOR.message_types_by_name['SequenceExample'] = _SEQUENCEEXAMPLE
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
Example = _reflection.GeneratedProtocolMessageType(
'Example',
(_message.Message, ),
dict(
DESCRIPTOR=_EXAMPLE,
__module__='propeller.paddle.data.example_pb2'
# @@protoc_insertion_point(class_scope:propeller.Example)
))
_sym_db.RegisterMessage(Example)
SequenceExample = _reflection.GeneratedProtocolMessageType(
'SequenceExample',
(_message.Message, ),
dict(
DESCRIPTOR=_SEQUENCEEXAMPLE,
__module__='propeller.paddle.data.example_pb2'
# @@protoc_insertion_point(class_scope:propeller.SequenceExample)
))
_sym_db.RegisterMessage(SequenceExample)
# @@protoc_insertion_point(module_scope)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package propeller;
message BytesList {
repeated bytes value = 1;
}
message FloatList {
repeated float value = 1 [packed = true];
}
message Int64List {
repeated int64 value = 1 [packed = true];
}
message Feature {
oneof kind {
BytesList bytes_list = 1;
FloatList float_list = 2;
Int64List int64_list = 3;
}
};
message Features {
map<string, Feature> feature = 1;
};
message FeatureList {
repeated Feature feature = 1;
};
message FeatureLists {
map<string, FeatureList> feature_list = 1;
};
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""FeatureColumns and many Column"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import os
import sys
import struct
from six.moves import zip, map
import itertools
import gzip
from functools import partial
import multiprocessing
import six
import logging
import numpy as np
from glob import glob
from propeller.paddle.train import distribution
from propeller.data.functional import _interleave_func
from propeller.paddle.data.functional import Dataset
from propeller.paddle.data import example_pb2, feature_pb2
log = logging.getLogger(__name__)
__all__ = [
'FeatureColumns', 'TextColumn', 'TextIDColumn', 'LabelColumn',
'basic_tokenizer', 'Column'
]
def basic_tokenizer(sen):
"""doc"""
seg = sen.split(b' ')
    seg = filter(lambda i: i != b'', seg)  # drop empty tokens from repeated spaces
return seg
class Column(object):
"""doc"""
def __init__(self, name):
"""doc"""
pass
def raw_to_proto(self, raw):
"""doc"""
return feature_pb2.Feature()
@property
def output_shapes(self):
"""doc"""
pass
@property
def output_types(self):
"""doc"""
pass
def proto_to_instance(self, proto):
"""doc"""
raise NotImplementedError()
def raw_to_instance(self, raw):
"""doc"""
raise NotImplementedError()
class LabelColumn(Column):
"""doc"""
def __init__(self, name, vocab_dict=None, vocab_file=None):
"""doc"""
self.name = name
self.vocab = None
if vocab_file:
self.vocab = {
j.strip(): i
for i, j in enumerate(open(vocab_file, 'rb').readlines())
}
if vocab_dict:
self.vocab = vocab_dict
@property
def output_shapes(self):
"""doc"""
return [1]
@property
def output_types(self):
"""doc"""
return 'int64'
def raw_to_proto(self, raw):
"""doc"""
if self.vocab is None:
ids = [int(raw)]
else:
ids = [self.vocab[raw]]
fe = feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=ids))
return fe
def proto_to_instance(self, feature):
"""doc"""
ret = np.array(feature.int64_list.value[0], dtype=np.int64)
return ret
def raw_to_instance(self, raw):
"""doc"""
if self.vocab is None:
ids = int(raw)
else:
ids = self.vocab[raw]
return ids
class TextColumn(Column):
"""doc"""
def __init__(self,
name,
unk_id,
vocab_file=None,
vocab_dict=None,
tokenizer=basic_tokenizer):
self.name = name
self.tokenizer = tokenizer
self.unk_id = unk_id
if not (vocab_file or vocab_dict):
raise ValueError('at least specify vocab_file or vocab_dict')
if vocab_file:
self.vocab = {
j.strip(): i
for i, j in enumerate(open(vocab_file, 'rb').readlines())
}
if vocab_dict:
self.vocab = vocab_dict
@property
def output_shapes(self):
"""doc"""
return [-1]
@property
def output_types(self):
"""doc"""
return 'int64'
def raw_to_proto(self, raw):
"""doc"""
ids = [self.vocab.get(s, self.unk_id) for s in self.tokenizer(raw)]
fe = feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=ids))
return fe
def proto_to_instance(self, feature):
"""doc"""
ret = np.array(feature.int64_list.value, dtype=np.int64)
return ret
def raw_to_instance(self, raw):
"""doc"""
ids = [self.vocab.get(s, self.unk_id) for s in self.tokenizer(raw)]
return np.array(ids, dtype=np.int64)
class TextIDColumn(Column):
"""doc"""
def __init__(self, name):
"""doc"""
self.name = name
@property
def output_shapes(self):
"""doc"""
return [-1]
@property
def output_types(self):
"""doc"""
return 'int64'
def raw_to_proto(self, raw):
"""doc"""
ids = [int(s) for s in raw.split(b' ')]
fe = feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=ids))
return fe
def proto_to_instance(self, feature):
"""doc"""
ret = np.array(feature.int64_list.value, dtype=np.int64)
return ret
def raw_to_instance(self, raw):
"""doc"""
ret = np.array([int(i) for i in raw.split(b' ')], dtype=np.int64)
return ret
def _list_files(raw_dir):
return [os.path.join(raw_dir, p) for p in os.listdir(raw_dir)]
class FeatureColumns(object):
"""A Dataset Factory object"""
def __init__(self, columns):
"""doc"""
self._columns = columns
def _make_gz_dataset(self, raw_dir, gz_dir):
assert raw_dir or gz_dir, 'data_dir not specified when using gz mode'
if raw_dir is not None:
assert os.path.exists(raw_dir), 'raw_dir not exists: %s' % raw_dir
raw_file = os.listdir(raw_dir)
if gz_dir is None:
gz_dir = '%s_gz' % raw_dir.rstrip('/')
if not os.path.exists(gz_dir):
os.mkdir(gz_dir)
if raw_dir is not None:
if len(raw_file) != 0:
log.debug('try making gz')
pool = multiprocessing.Pool()
args = [(os.path.join(raw_dir, f), os.path.join(gz_dir, f),
self._columns, b'\t') for f in raw_file]
pool.map(_make_gz, args)
pool.terminate()
else:
assert len(
os.listdir(gz_dir)
            ) != 0, 'cannot find gz file or raw-txt file at [%s] and [%s]' % (
raw_dir, gz_dir)
return gz_dir
def _read_gz_dataset(self,
gz_files,
shuffle=False,
repeat=True,
shard=False,
**kwargs):
if len(gz_files) == 0:
raise ValueError('reading gz from empty file list: %s' % gz_files)
log.info('reading gz from %s' % '\n'.join(gz_files))
dataset = Dataset.from_list(gz_files)
if repeat:
dataset = dataset.repeat()
if shard and distribution.status.mode == distribution.DistributionMode.NCCL:
log.info('Apply dataset sharding in distribution env')
            dataset = dataset.shard(distribution.status.num_replica,
                                    distribution.status.replica_id)
if shuffle:
dataset = dataset.shuffle(buffer_size=len(gz_files))
fn = partial(
_interleave_func,
map_fn=lambda filename: Dataset.from_record_file(filename),
cycle_length=len(gz_files),
block_length=1)
dataset = dataset.apply(fn)
if shuffle:
dataset = dataset.shuffle(buffer_size=1000)
def _parse_gz(record_str): # function that takes python_str as input
ex = example_pb2.Example()
ex.ParseFromString(record_str)
ret = []
fea_dict = ex.features.feature
for c in self._columns:
ins = c.proto_to_instance(fea_dict[c.name])
ret.append(ins)
return ret
dataset = dataset.map(_parse_gz)
return dataset
def _read_txt_dataset(self,
data_files,
shuffle=False,
repeat=True,
**kwargs):
log.info('reading raw files from %s' % '\n'.join(data_files))
dataset = Dataset.from_list(data_files)
if repeat:
dataset = dataset.repeat()
if shuffle:
dataset = dataset.shuffle(buffer_size=len(data_files))
fn = partial(
_interleave_func,
map_fn=lambda filename: Dataset.from_file(filename),
cycle_length=len(data_files),
block_length=1)
dataset = dataset.apply(fn)
if shuffle:
dataset = dataset.shuffle(buffer_size=1000)
def _parse_txt_file(
record_str): # function that takes python_str as input
features = record_str.strip(b'\n').split(b'\t')
ret = [
column.raw_to_instance(feature)
for feature, column in zip(features, self._columns)
]
return ret
dataset = dataset.map(_parse_txt_file)
return dataset
def _read_stdin_dataset(self, encoding='utf8', shuffle=False, **kwargs):
log.info('reading raw files stdin')
def _gen():
if six.PY3:
source = sys.stdin.buffer
else:
source = sys.stdin
while True:
line = source.readline()
if len(line) == 0:
break
yield line,
dataset = Dataset.from_generator_func(_gen)
if shuffle:
dataset = dataset.shuffle(buffer_size=1000)
def _parse_stdin(record_str):
"""function that takes python_str as input"""
features = record_str.strip(b'\n').split(b'\t')
ret = [
column.raw_to_instance(feature)
for feature, column in zip(features, self._columns)
]
return ret
dataset = dataset.map(_parse_stdin)
return dataset
def _prepare_dataset(self,
dataset,
map_func_before_batch=None,
map_func_after_batch=None,
shuffle_buffer_size=None,
batch_size=1,
pad_id=0,
prefetch=None,
**kwargs):
if map_func_before_batch is not None:
dataset = dataset.map(map_func_before_batch)
if batch_size:
dataset = dataset.padded_batch(batch_size, pad_id)
if map_func_after_batch is not None:
dataset = dataset.map(map_func_after_batch)
return dataset
def build_dataset(self,
name,
use_gz=True,
data_dir=None,
gz_dir=None,
data_file=None,
**kwargs):
"""
        build a `Dataset` from `data_dir` or `data_file`.
        If `use_gz` is set, try to convert the data files to gz format and save them to `gz_dir`; if `gz_dir` is not given, one is created.
"""
if use_gz:
gz_dir = self._make_gz_dataset(data_dir, gz_dir)
gz_files = _list_files(gz_dir) if gz_dir is not None else gz_dir
ds = self._read_gz_dataset(gz_files, **kwargs)
else:
if data_dir is not None:
data_files = _list_files(data_dir)
elif data_file is not None:
data_files = [data_file]
else:
raise ValueError('data_dir or data_files not specified')
ds = self._read_txt_dataset(data_files, **kwargs)
ds.name = name
return ds
def build_dataset_from_stdin(self, name, **kwargs):
"""doc"""
ds = self._read_stdin_dataset(**kwargs)
ds.name = name
return ds
def _make_gz(args):
try:
from_file, to_file, columns, sep = args
if os.path.exists(to_file):
return
with open(from_file, 'rb') as fin, gzip.open(to_file, 'wb') as fout:
log.debug('making gz %s => %s' % (from_file, to_file))
for i, line in enumerate(fin):
line = line.strip(b'\n').split(sep)
#if i % 10000 == 0:
# log.debug('making gz %s => %s [%d]' % (from_file, to_file, i))
if len(line) != len(columns):
log.error('columns not match at %s, got %d, expect %d' %
(from_file, len(line), len(columns)))
continue
features = {}
for l, c in zip(line, columns):
features[c.name] = c.raw_to_proto(l)
example = example_pb2.Example(features=feature_pb2.Features(
feature=features))
serialized = example.SerializeToString()
l = len(serialized)
data = struct.pack('i%ds' % l, l, serialized)
fout.write(data)
log.debug('done making gz %s => %s' % (from_file, to_file))
except Exception as e:
log.exception(e)
raise e
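The gzip record layout written by `_make_gz` above is a stream of length-prefixed serialized `Example` protos: a native `int` length followed by that many payload bytes, exactly what `_open_record` reads back. A standalone round-trip sketch (hypothetical path and payloads, no propeller imports needed):

```python
import gzip
import struct

payloads = [b'first record', b'second record']
with gzip.open('demo.gz', 'wb') as fout:
    for p in payloads:
        # same framing as _make_gz: 4-byte length, then the payload
        fout.write(struct.pack('i%ds' % len(p), len(p), p))

with gzip.open('demo.gz', 'rb') as fin:
    while True:
        head = fin.read(struct.calcsize('i'))
        if not head:
            break
        (length,) = struct.unpack('i', head)
        print(fin.read(length))  # b'first record', b'second record'
```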
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: propeller/paddle/data/feature.proto
import sys
_b = sys.version_info[0] < 3 and (lambda x: x) or (
lambda x: x.encode('latin1'))
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor.FileDescriptor(
name='propeller/paddle/data/feature.proto',
package='propeller',
syntax='proto3',
serialized_options=None,
serialized_pb=_b(
'\n#propeller/paddle/data/feature.proto\x12\tpropeller\"\x1a\n\tBytesList\x12\r\n\x05value\x18\x01 \x03(\x0c\"\x1e\n\tFloatList\x12\x11\n\x05value\x18\x01 \x03(\x02\x42\x02\x10\x01\"\x1e\n\tInt64List\x12\x11\n\x05value\x18\x01 \x03(\x03\x42\x02\x10\x01\"\x95\x01\n\x07\x46\x65\x61ture\x12*\n\nbytes_list\x18\x01 \x01(\x0b\x32\x14.propeller.BytesListH\x00\x12*\n\nfloat_list\x18\x02 \x01(\x0b\x32\x14.propeller.FloatListH\x00\x12*\n\nint64_list\x18\x03 \x01(\x0b\x32\x14.propeller.Int64ListH\x00\x42\x06\n\x04kind\"\x81\x01\n\x08\x46\x65\x61tures\x12\x31\n\x07\x66\x65\x61ture\x18\x01 \x03(\x0b\x32 .propeller.Features.FeatureEntry\x1a\x42\n\x0c\x46\x65\x61tureEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12!\n\x05value\x18\x02 \x01(\x0b\x32\x12.propeller.Feature:\x02\x38\x01\"2\n\x0b\x46\x65\x61tureList\x12#\n\x07\x66\x65\x61ture\x18\x01 \x03(\x0b\x32\x12.propeller.Feature\"\x9a\x01\n\x0c\x46\x65\x61tureLists\x12>\n\x0c\x66\x65\x61ture_list\x18\x01 \x03(\x0b\x32(.propeller.FeatureLists.FeatureListEntry\x1aJ\n\x10\x46\x65\x61tureListEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.propeller.FeatureList:\x02\x38\x01\x62\x06proto3'
))
_BYTESLIST = _descriptor.Descriptor(
name='BytesList',
full_name='propeller.BytesList',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='value',
full_name='propeller.BytesList.value',
index=0,
number=1,
type=12,
cpp_type=9,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[],
serialized_start=50,
serialized_end=76, )
_FLOATLIST = _descriptor.Descriptor(
name='FloatList',
full_name='propeller.FloatList',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='value',
full_name='propeller.FloatList.value',
index=0,
number=1,
type=2,
cpp_type=6,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=_b('\020\001'),
file=DESCRIPTOR),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[],
serialized_start=78,
serialized_end=108, )
_INT64LIST = _descriptor.Descriptor(
name='Int64List',
full_name='propeller.Int64List',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='value',
full_name='propeller.Int64List.value',
index=0,
number=1,
type=3,
cpp_type=2,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=_b('\020\001'),
file=DESCRIPTOR),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[],
serialized_start=110,
serialized_end=140, )
_FEATURE = _descriptor.Descriptor(
name='Feature',
full_name='propeller.Feature',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='bytes_list',
full_name='propeller.Feature.bytes_list',
index=0,
number=1,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='float_list',
full_name='propeller.Feature.float_list',
index=1,
number=2,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='int64_list',
full_name='propeller.Feature.int64_list',
index=2,
number=3,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
_descriptor.OneofDescriptor(
name='kind',
full_name='propeller.Feature.kind',
index=0,
containing_type=None,
fields=[]),
],
serialized_start=143,
serialized_end=292, )
_FEATURES_FEATUREENTRY = _descriptor.Descriptor(
name='FeatureEntry',
full_name='propeller.Features.FeatureEntry',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='key',
full_name='propeller.Features.FeatureEntry.key',
index=0,
number=1,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='value',
full_name='propeller.Features.FeatureEntry.value',
index=1,
number=2,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=_b('8\001'),
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[],
serialized_start=358,
serialized_end=424, )
_FEATURES = _descriptor.Descriptor(
name='Features',
full_name='propeller.Features',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='feature',
full_name='propeller.Features.feature',
index=0,
number=1,
type=11,
cpp_type=10,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
],
extensions=[],
nested_types=[_FEATURES_FEATUREENTRY, ],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[],
serialized_start=295,
serialized_end=424, )
_FEATURELIST = _descriptor.Descriptor(
name='FeatureList',
full_name='propeller.FeatureList',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='feature',
full_name='propeller.FeatureList.feature',
index=0,
number=1,
type=11,
cpp_type=10,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[],
serialized_start=426,
serialized_end=476, )
_FEATURELISTS_FEATURELISTENTRY = _descriptor.Descriptor(
name='FeatureListEntry',
full_name='propeller.FeatureLists.FeatureListEntry',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='key',
full_name='propeller.FeatureLists.FeatureListEntry.key',
index=0,
number=1,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b("").decode('utf-8'),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='value',
full_name='propeller.FeatureLists.FeatureListEntry.value',
index=1,
number=2,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=_b('8\001'),
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[],
serialized_start=559,
serialized_end=633, )
_FEATURELISTS = _descriptor.Descriptor(
name='FeatureLists',
full_name='propeller.FeatureLists',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='feature_list',
full_name='propeller.FeatureLists.feature_list',
index=0,
number=1,
type=11,
cpp_type=10,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
],
extensions=[],
nested_types=[_FEATURELISTS_FEATURELISTENTRY, ],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[],
serialized_start=479,
serialized_end=633, )
_FEATURE.fields_by_name['bytes_list'].message_type = _BYTESLIST
_FEATURE.fields_by_name['float_list'].message_type = _FLOATLIST
_FEATURE.fields_by_name['int64_list'].message_type = _INT64LIST
_FEATURE.oneofs_by_name['kind'].fields.append(_FEATURE.fields_by_name[
'bytes_list'])
_FEATURE.fields_by_name[
'bytes_list'].containing_oneof = _FEATURE.oneofs_by_name['kind']
_FEATURE.oneofs_by_name['kind'].fields.append(_FEATURE.fields_by_name[
'float_list'])
_FEATURE.fields_by_name[
'float_list'].containing_oneof = _FEATURE.oneofs_by_name['kind']
_FEATURE.oneofs_by_name['kind'].fields.append(_FEATURE.fields_by_name[
'int64_list'])
_FEATURE.fields_by_name[
'int64_list'].containing_oneof = _FEATURE.oneofs_by_name['kind']
_FEATURES_FEATUREENTRY.fields_by_name['value'].message_type = _FEATURE
_FEATURES_FEATUREENTRY.containing_type = _FEATURES
_FEATURES.fields_by_name['feature'].message_type = _FEATURES_FEATUREENTRY
_FEATURELIST.fields_by_name['feature'].message_type = _FEATURE
_FEATURELISTS_FEATURELISTENTRY.fields_by_name[
'value'].message_type = _FEATURELIST
_FEATURELISTS_FEATURELISTENTRY.containing_type = _FEATURELISTS
_FEATURELISTS.fields_by_name[
'feature_list'].message_type = _FEATURELISTS_FEATURELISTENTRY
DESCRIPTOR.message_types_by_name['BytesList'] = _BYTESLIST
DESCRIPTOR.message_types_by_name['FloatList'] = _FLOATLIST
DESCRIPTOR.message_types_by_name['Int64List'] = _INT64LIST
DESCRIPTOR.message_types_by_name['Feature'] = _FEATURE
DESCRIPTOR.message_types_by_name['Features'] = _FEATURES
DESCRIPTOR.message_types_by_name['FeatureList'] = _FEATURELIST
DESCRIPTOR.message_types_by_name['FeatureLists'] = _FEATURELISTS
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
BytesList = _reflection.GeneratedProtocolMessageType(
'BytesList',
(_message.Message, ),
dict(
DESCRIPTOR=_BYTESLIST,
__module__='propeller.paddle.data.feature_pb2'
# @@protoc_insertion_point(class_scope:propeller.BytesList)
))
_sym_db.RegisterMessage(BytesList)
FloatList = _reflection.GeneratedProtocolMessageType(
'FloatList',
(_message.Message, ),
dict(
DESCRIPTOR=_FLOATLIST,
__module__='propeller.paddle.data.feature_pb2'
# @@protoc_insertion_point(class_scope:propeller.FloatList)
))
_sym_db.RegisterMessage(FloatList)
Int64List = _reflection.GeneratedProtocolMessageType(
'Int64List',
(_message.Message, ),
dict(
DESCRIPTOR=_INT64LIST,
__module__='propeller.paddle.data.feature_pb2'
# @@protoc_insertion_point(class_scope:propeller.Int64List)
))
_sym_db.RegisterMessage(Int64List)
Feature = _reflection.GeneratedProtocolMessageType(
'Feature',
(_message.Message, ),
dict(
DESCRIPTOR=_FEATURE,
__module__='propeller.paddle.data.feature_pb2'
# @@protoc_insertion_point(class_scope:propeller.Feature)
))
_sym_db.RegisterMessage(Feature)
Features = _reflection.GeneratedProtocolMessageType(
'Features',
(_message.Message, ),
dict(
FeatureEntry=_reflection.GeneratedProtocolMessageType(
'FeatureEntry',
(_message.Message, ),
dict(
DESCRIPTOR=_FEATURES_FEATUREENTRY,
__module__='propeller.paddle.data.feature_pb2'
# @@protoc_insertion_point(class_scope:propeller.Features.FeatureEntry)
)),
DESCRIPTOR=_FEATURES,
__module__='propeller.paddle.data.feature_pb2'
# @@protoc_insertion_point(class_scope:propeller.Features)
))
_sym_db.RegisterMessage(Features)
_sym_db.RegisterMessage(Features.FeatureEntry)
FeatureList = _reflection.GeneratedProtocolMessageType(
'FeatureList',
(_message.Message, ),
dict(
DESCRIPTOR=_FEATURELIST,
__module__='propeller.paddle.data.feature_pb2'
# @@protoc_insertion_point(class_scope:propeller.FeatureList)
))
_sym_db.RegisterMessage(FeatureList)
FeatureLists = _reflection.GeneratedProtocolMessageType(
'FeatureLists',
(_message.Message, ),
dict(
FeatureListEntry=_reflection.GeneratedProtocolMessageType(
'FeatureListEntry',
(_message.Message, ),
dict(
DESCRIPTOR=_FEATURELISTS_FEATURELISTENTRY,
__module__='propeller.paddle.data.feature_pb2'
# @@protoc_insertion_point(class_scope:propeller.FeatureLists.FeatureListEntry)
)),
DESCRIPTOR=_FEATURELISTS,
__module__='propeller.paddle.data.feature_pb2'
# @@protoc_insertion_point(class_scope:propeller.FeatureLists)
))
_sym_db.RegisterMessage(FeatureLists)
_sym_db.RegisterMessage(FeatureLists.FeatureListEntry)
_FLOATLIST.fields_by_name['value']._options = None
_INT64LIST.fields_by_name['value']._options = None
_FEATURES_FEATUREENTRY._options = None
_FEATURELISTS_FEATURELISTENTRY._options = None
# @@protoc_insertion_point(module_scope)
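# NOTE: for readability, the serialized descriptor above corresponds to a
# schema along the lines of the following feature.proto (reconstructed from
# the FieldDescriptor definitions in this module, not copied from the source
# .proto file):
#
#   syntax = "proto3";
#   package propeller;
#
#   message BytesList { repeated bytes value = 1; }
#   message FloatList { repeated float value = 1 [packed = true]; }
#   message Int64List { repeated int64 value = 1 [packed = true]; }
#
#   message Feature {
#     oneof kind {
#       BytesList bytes_list = 1;
#       FloatList float_list = 2;
#       Int64List int64_list = 3;
#     }
#   }
#
#   message Features     { map<string, Feature> feature = 1; }
#   message FeatureList  { repeated Feature feature = 1; }
#   message FeatureLists { map<string, FeatureList> feature_list = 1; }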
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pyreader based Dataset"""
import sys
import numpy as np
import logging
import paddle.fluid as F
import paddle.fluid.layers as L
from propeller.data.functional import Dataset as DatasetBase
log = logging.getLogger(__name__)
class Dataset(DatasetBase):
"""Pyreader based Dataset"""
def placeholders(self):
"""doc"""
if self.name is None:
            raise ValueError('cannot get features from an unnamed Dataset')
ret = []
for i, (shape,
types) in enumerate(zip(self.data_shapes, self.data_types)):
ret.append(
L.data(
'%s_placeholder_%d' % (self.name, i),
shape=shape,
append_batch_size=False,
dtype=types))
return ret
def features(self):
"""start point of net building. call this in a program scope"""
if self.name is None:
            raise ValueError('cannot get features from an unnamed Dataset')
if len(self.data_shapes) != len(self.data_types):
raise ValueError(
                'Dataset shapes and types do not match: shapes: %s types: %s' %
                (repr(self.data_shapes), repr(self.data_types)))
return self.placeholders()
def start(self, places=None):
"""start Pyreader"""
if places is None:
places = F.cuda_places() if F.core.is_compiled_with_cuda(
) else F.cpu_places()
#assert self.pyreader is not None, 'use Dataset.features to build net first, then start dataset'
def _gen():
try:
for idx, i in enumerate(self.generator()):
yield i
except Exception as e:
log.exception(e)
raise e
r = F.io.PyReader(
feed_list=self.placeholders(), capacity=50, iterable=True)
r.decorate_batch_generator(_gen, places=places)
return r()
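# A minimal usage sketch (names here are illustrative, not part of this file;
# a real Dataset comes from propeller.data.functional with `name`,
# `data_shapes` and `data_types` already set):
#
#   ds.name = 'train'
#   feats = ds.features()          # build the net on these placeholders
#   reader = ds.start()            # returns a PyReader-backed iterator
#   for batch in reader:
#       exe.run(compiled_program, feed=batch)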
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""record summary tensor in a collection scope"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import sys
import paddle.fluid as F
from propeller.paddle.collection import default_collection, Key
def scalar(name, tensor):
"""scalar summary"""
if not isinstance(tensor, F.framework.Variable):
raise ValueError('expect paddle Variable, got %s' % repr(tensor))
tensor.persistable = True
default_collection().add(Key.SUMMARY_SCALAR, (name, tensor))
def histogram(name, tensor):
"""histogram summary"""
if not isinstance(tensor, F.framework.Variable):
raise ValueError('expect paddle Variable, got %s' % repr(tensor))
tensor.persistable = True
default_collection().add(Key.SUMMARY_HISTOGRAM, (name, tensor))
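# Usage sketch (assuming `loss` and `logits` are fluid Variables created while
# a propeller collection scope is active, as trainer.py does during net
# building):
#
#   from propeller.paddle import summary
#   summary.scalar('loss', loss)          # later fetched/logged by LoggingHook
#   summary.histogram('logits', logits)   # written to tensorboard if available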
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Propeller training"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import os
import sys
import logging
from time import time
log = logging.getLogger(__name__)
from propeller.paddle.train.monitored_executor import *
from propeller.paddle.train.trainer import *
from propeller.paddle.train.hooks import *
from propeller.train.model import Model
from propeller.paddle.train import exporter
from propeller.paddle.train import distribution
from propeller.paddle.train import metrics
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import functools
import six
import logging
from time import sleep
import paddle.fluid as F
import paddle.fluid.layers as L
log = logging.getLogger(__name__)
import propeller.util
__all__ = ['init_distribuition_env', 'status']
status = None
class DistributionMode(object):
LOCAL = 0
NCCL = 1
class DistributionStatus(object):
def __init__(self, config):
if config is None:
self._mode = DistributionMode.LOCAL
self._env = None
self._this = None
else:
try:
self._mode = DistributionMode.NCCL
cluster = config['cluster']
task = config['task']['type']
idx = int(config['task']['index'])
self._this = cluster[task][idx]
self._env = cluster['chief'] + cluster['worker']
if len(set(self._env)) != len(self._env):
raise ValueError('duplicate host in dis_config %s' %
config)
except KeyError as e:
                raise ValueError(
                    'PROPELLER_DISCONFIG wrong: %s not found in %s' %
                    (e, repr(config)))
@property
def mode(self):
return self._mode
@property
def num_replica(self):
if self._mode == DistributionMode.LOCAL:
return 1
elif self._mode == DistributionMode.NCCL:
return len(self._env)
else:
            raise ValueError('Got unknown distribution mode %s' %
                             repr(self._mode))
@property
def replica_id(self):
if self._mode == DistributionMode.LOCAL:
return 0
elif self._mode == DistributionMode.NCCL:
return self._env.index(self._this)
else:
            raise ValueError('Got unknown distribution mode %s' %
                             repr(self._mode))
@property
def is_master(self):
if self._mode == DistributionMode.LOCAL:
return True
elif self._mode == DistributionMode.NCCL:
return self.replica_id == 0
else:
            raise ValueError('got unknown distribution mode %s' %
                             repr(self._mode))
dis_config = propeller.util._get_dict_from_environ_or_json_or_file(
None, 'PROPELLER_DISCONFIG')
status = DistributionStatus(dis_config)
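# Expected PROPELLER_DISCONFIG layout, inferred from the parsing above
# (hosts and indices are illustrative only):
#
#   {
#     "cluster": {"chief":  ["10.0.0.1:6170"],
#                 "worker": ["10.0.0.2:6170", "10.0.0.3:6170"]},
#     "task": {"type": "worker", "index": 0}
#   }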
def run_on_master(func):
"""skip function in distribution env"""
@functools.wraps(func)
def f(*arg, **kwargs):
"""f"""
if status is None:
            raise ValueError('distribution mode unknown at this point')
if status.mode == DistributionMode.LOCAL:
r = func(*arg, **kwargs)
elif status.mode == DistributionMode.NCCL:
if status.is_master:
r = func(*arg, **kwargs)
else:
r = 0 # skip function
#MPI.COMM_WORLD.Barrier()
return r
return f
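# Usage sketch: guard side effects so they run once per job instead of once
# per replica (`dump_result` is a hypothetical function):
#
#   @run_on_master
#   def dump_result(path):
#       ...  # executed on the master replica only; other replicas get 0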
def init_distribuition_env(program):
if status.mode == DistributionMode.LOCAL:
log.info('Initializing local training')
elif status.mode == DistributionMode.NCCL:
config = F.DistributeTranspilerConfig()
config.mode = "nccl2"
config.nccl_comm_num = 1
F.DistributeTranspiler(config=config).transpile(
status.replica_id,
trainers=','.join(status._env),
current_endpoint=status._this,
program=program.train_program,
startup_program=program.startup_program)
log.info('Initializing distribution training with config %s' %
(repr(dis_config)))
if status.is_master:
sleep(30)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
exporters
"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import sys
import os
import itertools
import six
import inspect
import abc
import logging
import numpy as np
import paddle.fluid as F
import paddle.fluid.layers as L
from propeller.paddle.train import Saver
from propeller.types import InferenceSpec
from propeller.train.model import Model
from propeller.paddle.train.trainer import _build_net
from propeller.paddle.train.trainer import _build_model_fn
from propeller.types import RunMode
from propeller.types import ProgramPair
log = logging.getLogger(__name__)
@six.add_metaclass(abc.ABCMeta)
class Exporter(object):
"""base exporter"""
@abc.abstractmethod
    def export(self, exe, program, eval_model_spec, eval_result, state):
"""export"""
raise NotImplementedError()
class BestExporter(Exporter):
"""export saved model accordingto `cmp_fn`"""
def __init__(self, export_dir, cmp_fn):
"""doc"""
self._export_dir = export_dir
self._best = None
self.cmp_fn = cmp_fn
def export(self, exe, program, eval_model_spec, eval_result, state):
"""doc"""
log.debug('New evaluate result: %s \nold: %s' %
(repr(eval_result), repr(self._best)))
if self._best is None or self.cmp_fn(old=self._best, new=eval_result):
log.debug('[Best Exporter]: export to %s' % self._export_dir)
eval_program = program.train_program
            # FIXME: all eval datasets have the same name/types/shapes for now, so every eval program is the same
saver = Saver(
self._export_dir,
exe,
program=eval_program,
max_ckpt_to_keep=1)
saver.save(state)
self._best = eval_result
else:
log.debug('[Best Exporter]: skip step %s' % state.gstep)
class BestInferenceModelExporter(Exporter):
"""export inference model accordingto `cmp_fn`"""
def __init__(self,
export_dir,
cmp_fn,
model_class_or_model_fn=None,
hparams=None,
dataset=None):
"""doc"""
self._export_dir = export_dir
self._best = None
self.cmp_fn = cmp_fn
self.model_class_or_model_fn = model_class_or_model_fn
self.hparams = hparams
self.dataset = dataset
def export(self, exe, program, eval_model_spec, eval_result, state):
"""doc"""
if self.model_class_or_model_fn is not None and self.hparams is not None \
and self.dataset is not None:
log.info('Building program by user defined model function')
if issubclass(self.model_class_or_model_fn, Model):
_model_fn = _build_model_fn(self.model_class_or_model_fn)
elif inspect.isfunction(self.model_class_or_model_fn):
_model_fn = self.model_class_or_model_fn
else:
raise ValueError('unknown model %s' %
self.model_class_or_model_fn)
# build net
infer_program = F.Program()
startup_prog = F.Program()
with F.program_guard(infer_program, startup_prog):
#share var with Train net
with F.unique_name.guard():
log.info('Building Infer Graph')
infer_fea = self.dataset.features()
# run_config is None
self.model_spec = _build_net(_model_fn, infer_fea,
RunMode.PREDICT, self.hparams,
None)
log.info('Done')
infer_program = infer_program.clone(for_test=True)
self.program = ProgramPair(
train_program=infer_program, startup_program=startup_prog)
else:
self.program = program
self.model_spec = eval_model_spec
log.debug('New evaluate result: %s \nold: %s' %
(repr(eval_result), repr(self._best)))
if self._best is None or self.cmp_fn(old=self._best, new=eval_result):
log.debug('[Best Exporter]: export to %s' % self._export_dir)
if self.model_spec.inference_spec is None:
                raise ValueError('model_fn did not return an InferenceSpec')
inf_spec_dict = self.model_spec.inference_spec
if not isinstance(inf_spec_dict, dict):
inf_spec_dict = {'inference': inf_spec_dict}
for inf_spec_name, inf_spec in six.iteritems(inf_spec_dict):
if not isinstance(inf_spec, InferenceSpec):
                    raise ValueError('unknown inference spec type: %s' %
                                     inf_spec)
save_dir = os.path.join(self._export_dir, inf_spec_name)
log.debug('[Best Exporter]: save inference model: "%s" to %s' %
(inf_spec_name, save_dir))
feed_var = [i.name for i in inf_spec.inputs]
fetch_var = inf_spec.outputs
infer_program = self.program.train_program
startup_prog = F.Program()
F.io.save_inference_model(
save_dir,
feed_var,
fetch_var,
exe,
main_program=infer_program)
self._best = eval_result
else:
log.debug('[Best Exporter]: skip step %s' % state.gstep)
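# Usage sketch: keep only the checkpoint with the best eval result. The shape
# of `eval_result` is whatever the evaluation loop hands to `export`, so the
# 'acc' key below is a hypothetical metric name defined by the model_fn:
#
#   exporter = BestExporter(
#       export_dir='./best_model',
#       cmp_fn=lambda old, new: new['acc'] > old['acc'])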
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""train hooks"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import sys
import six
import os
import itertools
import numpy as np
import logging
import paddle.fluid as F
import paddle.fluid.layers as L
from propeller import util
from propeller.paddle.train import distribution
from propeller.paddle.train.metrics import Metrics
__all__ = [
'RunHook', 'TqdmProgressBarHook', 'TqdmNotebookProgressBarHook',
'CheckpointSaverHook', 'LoggingHook', 'StopAtStepHook', 'EvalHook'
]
log = logging.getLogger(__name__)
class RunHook(object):
"""RunHook Base class"""
def __init__(self):
"""doc"""
pass
def before_train(self, program):
"""doc"""
pass
def before_run(self, state):
"""doc"""
return []
def after_run(self, res_list, state):
"""doc"""
pass
def should_stop(self, state):
"""doc"""
return False
def after_train(self):
"""doc"""
pass
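# A minimal custom hook sketch, following the RunHook contract above
# (assumes `from time import time`, which this module does not import):
#
#   class TimerHook(RunHook):
#       def before_run(self, state):
#           self._tic = time()   # no extra fetch targets
#           return []
#       def after_run(self, res_list, state):
#           log.debug('step %d took %.3fs' % (state.gstep, time() - self._tic))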
class TqdmProgressBarHook(RunHook):
"""show a progress bar when training"""
def __init__(self, max_steps, desc=None):
"""doc"""
self.tqdm = None
import tqdm
from propeller import log as main_log
hdl = main_log.handlers[0]
        class _TqdmLoggingHandler(logging.Handler):
def emit(self, record):
"""doc"""
try:
msg = self.format(record)
tqdm.tqdm.write(msg, file=sys.stderr)
self.flush()
except (KeyboardInterrupt, SystemExit) as e:
raise e
except:
self.handleError(record)
        tqdm_hdl = _TqdmLoggingHandler()
tqdm_hdl.setFormatter(hdl.formatter)
main_log.removeHandler(hdl)
main_log.addHandler(tqdm_hdl)
        self.tqdm = tqdm.tqdm(total=max_steps, desc=desc)
def before_run(self, state):
self.tqdm.n = state.gstep
return []
def __del__(self):
if self.tqdm:
self.tqdm.close()
class TqdmNotebookProgressBarHook(RunHook):
"""show a progress bar when training"""
def __init__(self, max_steps, desc=None):
"""doc"""
self.tqdm = None
import tqdm
from propeller import log as main_log
hdl = main_log.handlers[0]
        class _TqdmLoggingHandler(logging.Handler):
def emit(self, record):
"""doc"""
try:
msg = self.format(record)
tqdm.tqdm.write(msg, file=sys.stderr)
self.flush()
except (KeyboardInterrupt, SystemExit) as e:
raise e
except:
self.handleError(record)
        tqdm_hdl = _TqdmLoggingHandler()
tqdm_hdl.setFormatter(hdl.formatter)
main_log.removeHandler(hdl)
main_log.addHandler(tqdm_hdl)
        self.tqdm = tqdm.tqdm_notebook(total=max_steps, desc=desc)
def before_run(self, state):
"""doc"""
self.tqdm.n = state.gstep
self.tqdm.refresh()
return []
def __del__(self):
"""doc"""
if self.tqdm:
self.tqdm.close()
class LoggingHook(RunHook):
"""log tensor in to screan and tensorboard"""
def __init__(self,
loss,
per_step=10,
skip_step=100,
summary_writer=None,
summary_record=None):
"""doc"""
if per_step is None or skip_step is None:
            raise ValueError('wrong step argument, per_step: %s skip_step: %s' %
                             (per_step, skip_step))
self.loss = loss
self.per_step = per_step
self.skip_step = skip_step
self.summary_record = summary_record
self.writer = summary_writer
self.last_state = None
def before_train(self, program):
"""doc"""
if self.summary_record:
if self.summary_record.scalar:
self.s_name, self.s_tolog = zip(*self.summary_record.scalar)
else:
self.s_name, self.s_tolog = [], []
if self.summary_record.histogram:
self.h_name, self.h_tolog = zip(*self.summary_record.histogram)
else:
self.h_name, self.h_tolog = [], []
def before_run(self, state):
"""doc"""
if state.gstep % self.per_step == 0 and state.step > self.skip_step:
ret = [self.loss]
if self.summary_record:
ret += self.s_tolog
ret += self.h_tolog
return ret
else:
return []
def after_run(self, res_list, state):
"""doc"""
if state.gstep % self.per_step == 0 and state.step > self.skip_step:
if not self.summary_record:
return
loss = float(res_list[0])
s_np = res_list[1:1 + len(self.s_name)]
h_np = res_list[1 + len(self.s_name):1 + len(self.s_name) + len(
self.h_name)]
if self.last_state is not None:
speed = (state.gstep - self.last_state.gstep) / (
state.time - self.last_state.time)
else:
speed = -1.
self.last_state = state
# log to tensorboard
if self.writer is not None:
self.writer.add_scalar('loss', loss, state.gstep)
for name, t in zip(self.s_name, s_np):
if np.isnan(t).any():
log.warning('Nan summary: %s, skip' % name)
else:
self.writer.add_scalar(name, t, state.gstep)
for name, t in zip(self.h_name, h_np):
if np.isnan(t).any():
log.warning('Nan summary: %s, skip' % name)
else:
self.writer.add_histogram(name, t, state.gstep)
if speed > 0.:
self.writer.add_scalar('global_step', speed, state.gstep)
# log to stdout
log.debug('\t'.join([
'step: %d' % state.gstep,
'steps/sec: %.5f' % speed,
'loss: %.5f' % loss,
'' if self.summary_record is None else ' '.join(
map(lambda t: '%s:%s' % t, zip(self.s_name, s_np))),
]))
class StopAtStepHook(RunHook):
"""stop training at some step"""
def __init__(self, stop_global_step, stop_step):
"""doc"""
self._stop_gstep = stop_global_step
self._stop_step = stop_step
def should_stop(self, state):
"""doc"""
if (self._stop_gstep and state.gstep >= self._stop_gstep) or \
(self._stop_step and state.step >= self._stop_step):
log.info('StopAtStepHook called stop')
return True
else:
return False
class EvalHook(RunHook):
"""hook this on a eval Executor"""
def __init__(self, metrics, summary_writer=None):
"""doc"""
self.writer = summary_writer
self._result = None
if not isinstance(metrics, dict):
raise ValueError('metrics should be dict, got %s' % repr(metrics))
for k, m in six.iteritems(metrics):
if not isinstance(m, Metrics):
raise ValueError(
'metrics %s should be instance of propeller.Metrics, got %s'
% (k, repr(m)))
if len(metrics):
self.names = list(metrics.keys())
self.metrics = list(metrics.values())
else:
self.names, self.metrics = [], []
def before_train(self, program):
"""doc"""
for m in self.metrics:
m.reset()
def before_run(self, state):
"""doc"""
ls = [m.tensor for m in self.metrics]
for i in ls:
if not (isinstance(i, list) or isinstance(i, tuple)):
raise ValueError(
'metrics should return tuple or list of tensors, got %s' %
repr(i))
for ii in i:
if not isinstance(ii, F.framework.Variable):
                    raise ValueError(
                        'metrics tensor should be a Variable, got %s of type %s'
                        % (repr(ii), type(ii)))
ls_flt, self.schema = util.flatten(ls)
#log.debug(ls_flt)
return ls_flt
def after_run(self, res_list, state):
"""doc"""
res = util.unflatten(res_list, self.schema)
for r, m in zip(res, self.metrics):
m.update(r)
@property
def result(self):
"""doc"""
return self._result
def after_train(self):
"""doc"""
printable = []
self._result = {}
for n, m in zip(self.names, self.metrics):
val = m.eval()
self._result[n] = val
return self.result
class CheckpointSaverHook(RunHook):
"""Save checkpoint every n step"""
def __init__(self, saver, per_step=10, skip_step=100):
"""doc"""
self.saver = saver
self.per_step = per_step
self.skip_step = skip_step
def after_run(self, res_list, state):
"""doc"""
if state.gstep % self.per_step == 0 and \
state.step > self.skip_step:
self.saver.save(state)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""predefined metrics"""
import sys
import os
import six
import numpy as np
import itertools
import logging
import paddle.fluid as F
import paddle.fluid.layers as L
import sklearn.metrics
log = logging.getLogger(__name__)
__all__ = [
'Metrics', 'F1', 'Recall', 'Precision', 'Mrr', 'Mean', 'Acc', 'ChunkF1',
'RecallAtPrecision'
]
class Metrics(object):
"""Metrics base class"""
def __init__(self):
"""doc"""
self.saver = []
@property
def tensor(self):
"""doc"""
pass
def update(self, *args):
"""doc"""
pass
def eval(self):
"""doc"""
pass
class Mean(Metrics):
"""doc"""
def __init__(self, t):
"""doc"""
self.t = t
self.reset()
def reset(self):
"""doc"""
self.saver = np.array([])
@property
def tensor(self):
"""doc"""
self.t.persistable = True
return self.t,
def update(self, args):
"""doc"""
t, = args
t = t.reshape([-1])
self.saver = np.concatenate([self.saver, t])
def eval(self):
"""doc"""
return self.saver.mean()
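# Semantics sketch: Mean flattens and accumulates every fetched value, then
# averages over all elements at eval time. With hypothetical numpy inputs:
#
#   m = Mean(loss_tensor)
#   m.update((np.array([0.5, 1.5]),))   # step 1
#   m.update((np.array([1.0]),))        # step 2
#   m.eval()                            # -> 1.0  (mean of all 3 elements)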
class Ppl(Mean):
"""doc"""
def eval(self):
"""doc"""
return np.exp(self.saver.mean())
class Acc(Mean):
"""doc"""
def __init__(self, label, pred):
"""doc"""
self.eq = L.equal(pred, label)
self.reset()
@property
def tensor(self):
"""doc"""
self.eq.persistable = True
return self.eq,
class MSE(Mean):
"""doc"""
def __init__(self, label, pred):
"""doc"""
diff = pred - label
self.mse = diff * diff
self.reset()
@property
def tensor(self):
"""doc"""
self.mse.persistable = True
return self.mse,
class Cosine(Mean):
"""doc"""
def __init__(self, label, pred):
"""doc"""
self.cos = L.cos_sim(label, pred)
self.reset()
@property
def tensor(self):
"""doc"""
self.cos.persistable = True
return self.cos,
class Precision(Metrics):
"""doc"""
def __init__(self, label, pred):
"""doc"""
self.label = label
self.pred = pred
self.reset()
def reset(self):
"""doc"""
self.label_saver = np.array([], dtype=np.bool)
self.pred_saver = np.array([], dtype=np.bool)
@property
def tensor(self):
"""doc"""
self.label.persistable = True
self.pred.persistable = True
return self.label, self.pred
def update(self, args):
"""doc"""
label, pred = args
label = label.reshape([-1]).astype(np.bool)
pred = pred.reshape([-1]).astype(np.bool)
if label.shape != pred.shape:
            raise ValueError(
                'Metrics precision: input shapes do not match: label:%s pred:%s'
                % (label, pred))
self.label_saver = np.concatenate([self.label_saver, label])
self.pred_saver = np.concatenate([self.pred_saver, pred])
def eval(self):
"""doc"""
tp = (self.label_saver & self.pred_saver).astype(np.int64).sum()
p = self.pred_saver.astype(np.int64).sum()
return tp / p
class Recall(Precision):
"""doc"""
def eval(self):
"""doc"""
tp = (self.label_saver & self.pred_saver).astype(np.int64).sum()
t = (self.label_saver).astype(np.int64).sum()
return tp / t
class F1(Precision):
"""doc"""
def eval(self):
"""doc"""
tp = (self.label_saver & self.pred_saver).astype(np.int64).sum()
t = self.label_saver.astype(np.int64).sum()
p = self.pred_saver.astype(np.int64).sum()
precision = tp / (p + 1.e-6)
recall = tp / (t + 1.e-6)
return 2 * precision * recall / (precision + recall + 1.e-6)
class Auc(Metrics):
"""doc"""
def __init__(self, label, pred):
"""doc"""
self.pred = pred
self.label = label
self.reset()
def reset(self):
"""doc"""
self.pred_saver = np.array([], dtype=np.float32)
self.label_saver = np.array([], dtype=np.bool)
@property
def tensor(self):
"""doc"""
self.pred.persistable = True
self.label.persistable = True
return [self.pred, self.label]
def update(self, args):
"""doc"""
pred, label = args
pred = pred.reshape([-1]).astype(np.float32)
label = label.reshape([-1]).astype(np.bool)
self.pred_saver = np.concatenate([self.pred_saver, pred])
self.label_saver = np.concatenate([self.label_saver, label])
def eval(self):
"""doc"""
fpr, tpr, thresholds = sklearn.metrics.roc_curve(
self.label_saver.astype(np.int64), self.pred_saver)
auc = sklearn.metrics.auc(fpr, tpr)
return auc
class RecallAtPrecision(Auc):
"""doc"""
def __init__(self, label, pred, precision=0.9):
"""doc"""
super(RecallAtPrecision, self).__init__(label, pred)
self.precision = precision
def eval(self):
"""doc"""
self.pred_saver = self.pred_saver.reshape(
[self.label_saver.size, -1])[:, -1]
precision, recall, thresholds = sklearn.metrics.precision_recall_curve(
self.label_saver, self.pred_saver)
for p, r in zip(precision, recall):
if p > self.precision:
return r
class PrecisionAtThreshold(Auc):
"""doc"""
def __init__(self, label, pred, threshold=0.5):
"""doc"""
        super(PrecisionAtThreshold, self).__init__(label, pred)
self.threshold = threshold
def eval(self):
"""doc"""
infered = self.pred_saver > self.threshold
correct_num = np.array(infered & self.label_saver).sum()
infer_num = infered.sum()
return correct_num / (infer_num + 1.e-6)
class Mrr(Metrics):
"""doc"""
def __init__(self, qid, label, pred):
"""doc"""
self.qid = qid
self.label = label
self.pred = pred
self.reset()
def reset(self):
"""doc"""
self.qid_saver = np.array([], dtype=np.int64)
self.label_saver = np.array([], dtype=np.int64)
self.pred_saver = np.array([], dtype=np.float32)
@property
def tensor(self):
"""doc"""
self.qid.persistable = True
self.label.persistable = True
self.pred.persistable = True
return [self.qid, self.label, self.pred]
def update(self, args):
"""doc"""
qid, label, pred = args
if not (qid.shape[0] == label.shape[0] == pred.shape[0]):
            raise ValueError(
                'Mrr dimension mismatch: qid[%s] label[%s], pred[%s]' %
                (qid.shape, label.shape, pred.shape))
self.qid_saver = np.concatenate(
[self.qid_saver, qid.reshape([-1]).astype(np.int64)])
self.label_saver = np.concatenate(
[self.label_saver, label.reshape([-1]).astype(np.int64)])
self.pred_saver = np.concatenate(
[self.pred_saver, pred.reshape([-1]).astype(np.float32)])
def eval(self):
"""doc"""
def _key_func(tup):
return tup[0]
def _calc_func(tup):
ranks = [
1. / (rank + 1.)
for rank, (_, l, p) in enumerate(
sorted(
tup, key=lambda t: t[2], reverse=True)) if l != 0
]
if len(ranks):
return ranks[0]
else:
return 0.
mrr_for_qid = [
_calc_func(tup)
for _, tup in itertools.groupby(
sorted(
zip(self.qid_saver, self.label_saver, self.pred_saver),
key=_key_func),
key=_key_func)
]
mrr = np.float32(sum(mrr_for_qid) / len(mrr_for_qid))
return mrr
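# Worked example for a single query:
#   qid = [7, 7, 7], label = [0, 1, 0], pred = [0.9, 0.8, 0.1]
# Sorting by pred (descending) puts the only relevant item (label != 0) at
# rank index 1, so its reciprocal rank is 1 / (1 + 1) = 0.5 and the MRR over
# this one qid is 0.5.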
class ChunkF1(Metrics):
"""doc"""
def __init__(self, label, pred, seqlen, num_label):
"""doc"""
self.label = label
self.pred = pred
self.seqlen = seqlen
self.null_index = num_label - 1
self.label_cnt = 0
self.pred_cnt = 0
self.correct_cnt = 0
def _extract_bio_chunk(self, seq):
chunks = []
cur_chunk = None
for index in range(len(seq)):
tag = seq[index]
tag_type = tag // 2
tag_pos = tag % 2
if tag == self.null_index:
if cur_chunk is not None:
chunks.append(cur_chunk)
cur_chunk = None
continue
if tag_pos == 0:
if cur_chunk is not None:
chunks.append(cur_chunk)
cur_chunk = {}
cur_chunk = {"st": index, "en": index + 1, "type": tag_type}
else:
if cur_chunk is None:
cur_chunk = {
"st": index,
"en": index + 1,
"type": tag_type
}
continue
if cur_chunk["type"] == tag_type:
cur_chunk["en"] = index + 1
else:
chunks.append(cur_chunk)
cur_chunk = {
"st": index,
"en": index + 1,
"type": tag_type
}
if cur_chunk is not None:
chunks.append(cur_chunk)
return chunks
def reset(self):
"""doc"""
self.label_cnt = 0
self.pred_cnt = 0
self.correct_cnt = 0
@property
def tensor(self):
"""doc"""
self.pred.persistable = True
self.label.persistable = True
self.seqlen.persistable = True
return [self.pred, self.label, self.seqlen]
def update(self, args):
"""doc"""
pred, label, seqlen = args
pred = pred.reshape([-1]).astype(np.int32).tolist()
label = label.reshape([-1]).astype(np.int32).tolist()
seqlen = seqlen.reshape([-1]).astype(np.int32).tolist()
max_len = 0
for l in seqlen:
max_len = max(max_len, l)
for i in range(len(seqlen)):
seq_st = i * max_len + 1
seq_en = seq_st + (seqlen[i] - 2)
pred_chunks = self._extract_bio_chunk(pred[seq_st:seq_en])
label_chunks = self._extract_bio_chunk(label[seq_st:seq_en])
self.pred_cnt += len(pred_chunks)
self.label_cnt += len(label_chunks)
pred_index = 0
label_index = 0
while label_index < len(label_chunks) and pred_index < len(
pred_chunks):
if pred_chunks[pred_index]['st'] < label_chunks[label_index][
'st']:
pred_index += 1
elif pred_chunks[pred_index]['st'] > label_chunks[label_index][
'st']:
label_index += 1
else:
if pred_chunks[pred_index]['en'] == label_chunks[label_index]['en'] \
and pred_chunks[pred_index]['type'] == label_chunks[label_index]['type']:
self.correct_cnt += 1
pred_index += 1
label_index += 1
def eval(self):
"""doc"""
if self.pred_cnt == 0:
precision = 0.0
else:
precision = 1.0 * self.correct_cnt / self.pred_cnt
if self.label_cnt == 0:
recall = 0.0
else:
recall = 1.0 * self.correct_cnt / self.label_cnt
if self.correct_cnt == 0:
f1 = 0.0
else:
f1 = 2 * precision * recall / (precision + recall)
return np.float32(f1)
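# Tag-scheme assumption implied by _extract_bio_chunk: labels are laid out as
# B-type0=0, I-type0=1, B-type1=2, I-type1=3, ... with the "O" tag last
# (null_index == num_label - 1); tag // 2 recovers the entity type and
# tag % 2 distinguishes B (0) from I (1). E.g. with num_label = 5:
#
#   seq = [0, 1, 4, 2, 3]   # B-t0 I-t0 O B-t1 I-t1
#   -> [{'st': 0, 'en': 2, 'type': 0}, {'st': 3, 'en': 5, 'type': 1}]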
class PNRatio(Metrics):
"""doc"""
def __init__(self, qid, label, pred):
"""doc"""
self.qid = qid
self.label = label
self.pred = pred
self.saver = {}
def reset(self):
"""doc"""
self.saver = {}
@property
def tensor(self):
"""doc"""
self.qid.persistable = True
self.label.persistable = True
self.pred.persistable = True
return [self.qid, self.label, self.pred]
def update(self, args):
"""doc"""
qid, label, pred = args
if not (qid.shape[0] == label.shape[0] == pred.shape[0]):
            raise ValueError('dimension mismatch: qid[%s] label[%s], pred[%s]'
                             % (qid.shape, label.shape, pred.shape))
qid = qid.reshape([-1]).tolist()
label = label.reshape([-1]).tolist()
pred = pred.reshape([-1]).tolist()
assert len(qid) == len(label) == len(pred)
for q, l, p in zip(qid, label, pred):
if q not in self.saver:
self.saver[q] = []
self.saver[q].append((l, p))
def eval(self):
"""doc"""
p = 0
n = 0
for qid, outputs in self.saver.items():
for i in range(0, len(outputs)):
l1, p1 = outputs[i]
for j in range(i + 1, len(outputs)):
l2, p2 = outputs[j]
if l1 > l2:
if p1 > p2:
p += 1
elif p1 < p2:
n += 1
elif l1 < l2:
if p1 < p2:
p += 1
elif p1 > p2:
n += 1
pn = p / n if n > 0 else 0.0
return np.float32(pn)
class BinaryPNRatio(PNRatio):
"""doc"""
def __init__(self, qid, label, pred):
"""doc"""
super(BinaryPNRatio, self).__init__(qid, label, pred)
def eval(self):
"""doc"""
p = 0
n = 0
for qid, outputs in self.saver.items():
pos_set = []
neg_set = []
for label, score in outputs:
if label == 1:
pos_set.append(score)
else:
neg_set.append(score)
for ps in pos_set:
for ns in neg_set:
if ps > ns:
p += 1
elif ps < ns:
n += 1
else:
continue
pn = p / n if n > 0 else 0.0
return np.float32(pn)
class PrecisionAtK(Metrics):
"""doc"""
def __init__(self, qid, label, pred, k=1):
"""doc"""
self.qid = qid
self.label = label
self.pred = pred
self.k = k
self.saver = {}
def reset(self):
"""doc"""
self.saver = {}
@property
def tensor(self):
"""doc"""
self.qid.persistable = True
self.label.persistable = True
self.pred.persistable = True
return [self.qid, self.label, self.pred]
def update(self, args):
"""doc"""
qid, label, pred = args
if not (qid.shape[0] == label.shape[0] == pred.shape[0]):
            raise ValueError('dimension mismatch: qid[%s] label[%s], pred[%s]'
                             % (qid.shape, label.shape, pred.shape))
qid = qid.reshape([-1]).tolist()
label = label.reshape([-1]).tolist()
pred = pred.reshape([-1]).tolist()
assert len(qid) == len(label) == len(pred)
for q, l, p in zip(qid, label, pred):
if q not in self.saver:
self.saver[q] = []
self.saver[q].append((l, p))
def eval(self):
"""doc"""
right = 0
total = 0
for v in self.saver.values():
v = sorted(v, key=lambda x: x[1], reverse=True)
k = min(self.k, len(v))
for i in range(k):
if v[i][0] == 1:
right += 1
break
total += 1
return np.float32(1.0 * right / total)
#class SemanticRecallMetrics(Metrics):
# def __init__(self, qid, vec, type_id):
# self.qid = qid
# self.vec = vec
# self.type_id = type_id
# self.reset()
#
# def reset(self):
# self.saver = []
#
# @property
# def tensor(self):
# return [self.qid, self.vec, self.type_id]
#
# def update(self, args):
# qid, vec, type_id = args
# self.saver.append((qid, vec, type_id))
#
# def eval(self):
# dic = {}
# for qid, vec, type_id in self.saver():
# dic.setdefault(i, {}).setdefault(k, []).append(vec)
#
# for qid in dic:
# assert len(dic[qid]) == 3
# qvec = np.arrray(dic[qid][0])
# assert len(qvec) == 1
# ptvec = np.array(dic[qid][1])
# ntvec = np.array(dic[qid][2])
#
# np.matmul(qvec, np.transpose(ptvec))
# np.matmul(qvec, np.transpose(ntvec))
#
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
doc
"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import os
import json
from functools import reduce
import six
from time import time
import shutil
import logging
import numpy as np
import paddle.fluid as F
import paddle.fluid.layers as L
from propeller import util
from propeller.types import StopException, ProgramPair
from propeller.paddle.train import hooks
from . import distribution
log = logging.getLogger(__name__)
__all__ = ['MonitoredExecutor', 'Saver']
def _get_one_place():
return F.cuda_places()[0] if F.core.is_compiled_with_cuda(
) else F.cpu_places()[0]
class RunState(object):
"""serializable Run state object"""
@classmethod
def from_str(cls, s):
"""doc"""
j = json.loads(s)
ret = RunState()
ret._gstep = j['global_step']
ret._time = j['time']
ret._step = 0
return ret
def __init__(self):
"""doc"""
self._gstep = 0
self._step = 0
self._time = time()
@property
def gstep(self):
"""doc"""
return self._gstep
@property
def step(self):
"""doc"""
return self._step
@property
def time(self):
"""doc"""
return self._time
def __repr__(self):
"""doc"""
return repr({'global_step': self._gstep, 'time': self._time})
def serialize(self):
"""doc"""
return json.dumps({'global_step': self._gstep, 'time': self._time})
def next(self):
"""doc"""
ret = RunState()
ret._gstep = self._gstep + 1
ret._step = self._step + 1
ret._time = time()
return ret
class Saver(object):
"""checkpoint saver and manager"""
def __init__(self,
save_dir,
exe,
program,
save_prefix='model',
max_ckpt_to_keep=None):
"""doc"""
if exe is not None:
assert isinstance(
exe, F.Executor
), 'expect normal executor to save, got executor of type %s' % repr(
type(exe))
self._exe = exe
self._program = program
self._save_dir = save_dir
self._save_prefix = save_prefix
self._max_ckpt_to_keep = 10 if max_ckpt_to_keep is None else max_ckpt_to_keep
self.ckpt_info_path = os.path.join(save_dir, 'ckpt_info')
if os.path.exists(self.ckpt_info_path):
self.ckpt_list = [
p.strip() for p in open(self.ckpt_info_path).readlines()
]
log.debug('ckpt_list in this Saver: %s' % (self.ckpt_list))
else:
self.ckpt_list = []
@property
def last_ckpt(self):
"""doc"""
return self.ckpt_list[-1] if len(self.ckpt_list) else None
def save(self, state):
"""doc"""
save_name = '%s_%d' % (self._save_prefix, state.gstep)
save_dir = os.path.join(self._save_dir, save_name)
tmp_dir = os.path.join(self._save_dir, 'tmp')
try:
shutil.rmtree(save_dir)
shutil.rmtree(tmp_dir)
except OSError:
pass
log.debug('saving step %d to %s' % (state.gstep, save_dir))
F.io.save_persistables(self._exe, tmp_dir, self._program)
shutil.move(tmp_dir, save_dir)
meta = state.serialize()
open(os.path.join(save_dir, 'meta'), 'w').write(meta)
self.ckpt_list.append(save_name)
if len(self.ckpt_list) > self._max_ckpt_to_keep:
ckpt_to_keep = self.ckpt_list[-self._max_ckpt_to_keep:]
ckpt_to_remove = set(self.ckpt_list) - set(ckpt_to_keep)
self.ckpt_list = ckpt_to_keep
for ckpt in ckpt_to_remove:
ckpt_dir = os.path.join(self._save_dir, ckpt)
if os.path.exists(ckpt_dir):
shutil.rmtree(ckpt_dir)
log.debug('No. of ckpt exceed %d, clean up: %s' %
(self._max_ckpt_to_keep, ckpt_dir))
open(self.ckpt_info_path, 'w').write('\n'.join(self.ckpt_list))
def restore(self, ckpt=-1):
"""doc"""
if isinstance(ckpt, int):
try:
path = os.path.join(self._save_dir, self.ckpt_list[ckpt])
except IndexError:
raise ValueError('invalid restore ckpt number %d' % ckpt)
elif isinstance(ckpt, six.string_types):
if not os.path.exists(ckpt):
raise ValueError('ckpt: %s not found' % ckpt)
path = ckpt
else:
raise ValueError('ckpt type not understood %s' % repr(ckpt))
meta_file = os.path.join(path, 'meta')
if not os.path.exists(meta_file):
raise RuntimeError('meta not found in restore dir: %s' % path)
state = RunState.from_str(open(meta_file).read())
log.info('restore from ckpt %s, ckpt-status: %s' % (path, repr(state)))
def _fn(v):
vpath = os.path.join(path, v.name)
if F.io.is_persistable(v):
if os.path.exists(vpath):
return True
else:
log.warning('var %s not found in checkpoint, ignored' %
v.name)
return False
F.io.load_vars(
self._exe, path, main_program=self._program, predicate=_fn)
return state
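# Usage sketch (paths are illustrative):
#
#   saver = Saver('./output', F.Executor(_get_one_place()),
#                 program=train_program, max_ckpt_to_keep=3)
#   saver.save(state)         # writes ./output/model_<gstep>/ plus a meta file
#   state = saver.restore()   # reloads the newest entry listed in ckpt_info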
class MonitoredExecutor(object):
"""An Executor wrapper handling the train loop"""
def __init__(
self,
executor,
program,
loss=None, #must set in train
state=None,
run_config=None, #none if not load
run_hooks=[],
warm_start_setting=None):
        if isinstance(executor, F.ParallelExecutor):
            raise ValueError('ParallelExecutor is deprecated, use Executor')
        if not isinstance(executor, F.Executor):
            raise ValueError('expect F.Executor, got %s' % repr(executor))
self._exe = executor
self._hooks = run_hooks
        self._state = RunState()  # might be overwritten when restoring from a checkpoint
self._program = program
self._loss = loss
self._warm_start_setting = warm_start_setting
self._saver = None # will set in prepare
self.result = None # will set after train
if run_config is not None:
self._model_dir = run_config.model_dir
self._save_dir = run_config.model_dir
self._save_steps = run_config.save_steps
self._skip_steps = run_config.skip_steps if run_config.skip_steps else 100
self._save_prefix = 'model'
self._max_ckpt = run_config.max_ckpt
@property
def state(self):
"""doc"""
return self._state
def init_or_restore_variables(self, ckpt=-1):
"""
init vars or restore vars from model_dir
call before train
"""
# The order of this 2 steps really matters
# 1. init train
F.Executor(_get_one_place()).run(self._program.startup_program)
# 2. restore param
if self._warm_start_setting is not None:
if not os.path.exists(self._warm_start_setting.from_dir):
                raise ValueError('warm start dir does not exist: %s' %
                                 self._warm_start_setting.from_dir)
log.info("warm start from %s" % self._warm_start_setting.from_dir)
if self._warm_start_setting.predicate_fn is not None:
def _fn(v):
ret = self._warm_start_setting.predicate_fn(v)
if ret:
log.info('warm start: %s' % v.name)
return ret
F.io.load_vars(
F.Executor(_get_one_place()),
self._warm_start_setting.from_dir,
main_program=self._program.train_program,
predicate=_fn)
else:
raise NotImplementedError()
self._saver = Saver(
self._model_dir,
F.Executor(_get_one_place()),
program=self._program.train_program,
max_ckpt_to_keep=self._max_ckpt)
if self._saver.last_ckpt is not None:
self._state = self._saver.restore(ckpt)
def _freeze(self):
"""
call before enter train loop
convert program to compiled program
will do nothing if loss is None i.e. not in train mode
"""
if self._loss is None:
log.debug('will not freeze a program without loss')
return
if isinstance(self._program.train_program, F.compiler.CompiledProgram):
log.debug('program has already been built')
return
exec_strategy = F.ExecutionStrategy()
        exec_strategy.num_threads = 4  # 2 for fp32, 4 for fp16
        exec_strategy.use_experimental_executor = True
        exec_strategy.num_iteration_per_drop_scope = 10  # important: drop local scopes periodically to bound memory use
build_strategy = F.BuildStrategy()
build_strategy.remove_unnecessary_lock = False
#build_strategy.fuse_broadcast_ops = True
build_strategy.num_trainers = distribution.status.num_replica
build_strategy.trainer_id = distribution.status.replica_id
build_strategy.memory_optimize = True
log.info('replica id %d of %d' % (distribution.status.replica_id,
distribution.status.num_replica))
program = F.CompiledProgram(
self._program.train_program).with_data_parallel(
loss_name=self._loss.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
self._program = ProgramPair(
train_program=program,
startup_program=self._program.startup_program)
def __enter__(self):
"""
        prepare before entering the train loop
"""
if F.core.is_compiled_with_cuda():
log.info('propeller runs in CUDA mode')
else:
log.info('propeller runs in CPU mode')
log.debug('freezing program')
self._freeze()
log.debug('done freezing')
log.info('********** Start Loop ************')
# TODO init
self.result = None
for h in self._hooks:
log.debug('train loop has hook %s' % h)
h.before_train(self._program)
return self
def run(self, fetch_list=[], *args, **kwargs):
"""
wrapper for Executor.run
"""
#log.debug('Executor running step %d' % self._state.gstep)
if self._hooks:
fetch_list = [fetch_list]
for h in self._hooks:
#log.debug('calling hook.before_run %s' % h)
fetch = h.before_run(self._state)
fetch_list.append(fetch)
fetch_list_len = map(len, fetch_list)
fetch_list, schema = util.flatten(fetch_list)
fetch_list = [
f.name if not isinstance(f, six.string_types) else f
for f in fetch_list
]
            #if len(set(fetch_list)) != len(fetch_list):
            #    log.error('unexpected behavior when fetch list contains identical tensors %s' % fetch_list)
#log.debug(fetch_list)
res = self._exe.run(self._program.train_program,
fetch_list=fetch_list,
*args,
**kwargs)
res = [self._merge_result(r) for r in res]
#log.debug(res)
res = util.unflatten(res, schema)
ret, res = res[0], res[1:]
for r, h in zip(res, self._hooks):
#log.debug('calling hook.after_run')
h.after_run(r, self._state)
if any(map(lambda i: i.should_stop(self._state), self._hooks)):
                raise StopException('hook called stop')
else:
ret = self._exe.run(self._program.train_program,
fetch_list=fetch_list,
*args,
**kwargs)
self._state = self._state.next()
return ret
def __exit__(self, err_type, err_value, trace):
"""
clean up things and report hook result when exit train loop
"""
if (err_type is None) or isinstance(err_value, (
F.core.EOFException, StopException, KeyboardInterrupt)):
try:
log.info('********** Stop Loop ************')
self.result = []
for h in self._hooks:
self.result.append(h.after_train())
except Exception as e:
                log.exception('error occurred after loop %s' % repr(e))
        else:
            log.info('********** Interrupt Loop ************')
            log.exception('error occurred during loop %s: %s' %
                          (err_type, err_value))
def _merge_result(self, ls):
"""
merge results from multi gpu cards
"""
dev_count = len(self._program.train_program._places) if isinstance(
self._program.train_program, F.compiler.CompiledProgram) else 1
if dev_count == 1:
return ls
else:
shape = (-1, ls.shape[0] // dev_count) + ls.shape[1:]
ret = np.reshape(ls, shape).mean(axis=0)
return ret
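# _merge_result example: with 2 places the fetched array stacks the per-card
# results along dim 0, so
#   ls = np.array([1., 2., 3., 4.])     # card 0: [1., 2.], card 1: [3., 4.]
# is reshaped to (2, 2) and averaged over cards -> [2., 3.]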
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""common ML train and eval procedure"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import os
import itertools
import six
import inspect
from collections import namedtuple
from contextlib import contextmanager
from six.moves import zip, map
import logging
from time import time
import paddle.fluid as F
import paddle.fluid.layers as L
from propeller.types import RunMode, StopException, SummaryRecord
from propeller.types import ModelSpec, InferenceSpec, ProgramPair, RunConfig
from propeller.paddle import summary, collection
from propeller.paddle.data.functional import Dataset
from propeller.paddle.train import distribution
from propeller.train.model import Model
from propeller.paddle.train.monitored_executor import Saver
from propeller.paddle.train import hooks, metrics
from propeller.paddle.train.monitored_executor import MonitoredExecutor
log = logging.getLogger(__name__)
__all__ = ['train_and_eval', 'Learner']
def _get_summary_writer(path):
summary_writer = None
try:
from tensorboardX import SummaryWriter
if distribution.status.is_master:
summary_writer = SummaryWriter(os.path.join(path))
except ImportError:
log.warning('tensorboardX not installed, will not log to tensorboard')
return summary_writer
def _get_one_place():
return F.cuda_places()[0] if F.core.is_compiled_with_cuda(
) else F.cpu_places()[0]
def _log_eval_result(name, eval_result, swriter, state):
log.debug(eval_result)
printable = []
for n, val in six.iteritems(eval_result):
        assert val.shape == (), 'metrics eval should return a scalar float'
printable.append('{}\t{}'.format(n, val))
if swriter is not None:
swriter.add_scalar(n, val, state.gstep)
log.debug('write to tensorboard %s' % swriter.logdir)
if len(printable):
log.info('*** eval res: %10s ***' % name)
for p in printable:
log.info(p)
log.info('******************************')
def _build_net(model_fn, features, mode, params, run_config):
model_spec = model_fn(
features=features, mode=mode, params=params, run_config=run_config)
if mode == RunMode.TRAIN:
if not isinstance(model_spec.loss, F.framework.Variable):
            raise ValueError('model_spec.loss should be a Variable, got %s' %
                             repr(model_spec.loss))
if not (model_spec.loss.shape == () or model_spec.loss.shape == (1, )):
            raise ValueError('expect scalar loss, got %s' %
                             repr(model_spec.loss.shape))
model_spec.loss.persistable = True
elif mode == RunMode.EVAL:
if not isinstance(model_spec.metrics, dict):
raise ValueError('model_spec.metrics should be dict, got %s' %
repr(model_spec.metrics))
elif mode == RunMode.PREDICT:
if not isinstance(model_spec.predictions, (list, tuple)):
raise ValueError('model_spec.predictions should be list, got %s' %
repr(model_spec.predictions))
else:
raise ValueError('unknown mode %s' % mode)
return model_spec
class Learner(object):
"""A Learner can train / eval / predict on a Dataset"""
def __init__(self,
model_class_or_model_fn,
run_config,
params=None,
warm_start_setting=None):
"""
model_class_or_model_fn(callable|propeller.train.Model): `model_class_or_model_fn` be specified in 2 ways:
1. subclass of propeller.train.Model which implements:
1. \_\_init\_\_ (hyper_param, mode, run_config)
2. forward (features) => (prediction)
3. backword (loss) => None
4. loss (predictoin) => (loss)
5. metrics (optional) (prediction) => (dict of propeller.Metrics)
2. a model_fn takes following args:
1. features
2. param
3. mode
4. run_config(optional)
and returns a `propeller.ModelSpec`
params: any python object, will pass to your `model_fn` or `propeller.train.Model`
run_config (propeller.RunConfig): run_config.max_steps should not be None.
warm_start_setting (propeller.WarmStartSetting): Optional. warm start variable will overwrite model variable.
"""
if run_config.model_dir is None:
raise ValueError('model_dir should be specified in run_config')
if inspect.isclass(model_class_or_model_fn) and issubclass(
        model_class_or_model_fn, Model):
_model_fn = _build_model_fn(model_class_or_model_fn)
elif inspect.isfunction(model_class_or_model_fn):
_model_fn = model_class_or_model_fn
else:
raise ValueError('unknown model %s' % model_class_or_model_fn)
self.model_fn = _model_fn
self.params = params
self.run_config = run_config
self.warm_start_setting = warm_start_setting
def _build_for_train(self, train_dataset):
train_dataset.name = 'train'
train_program = F.Program()
startup_prog = F.Program()
with F.program_guard(train_program, startup_prog):
with F.unique_name.guard():
with collection.Collections() as collections:
log.info('Building Train Graph...')
fea = train_dataset.features()
model_spec = _build_net(self.model_fn, fea, RunMode.TRAIN,
self.params, self.run_config)
log.info('Building Train Graph: Done')
scalars = collections.get(collection.Key.SUMMARY_SCALAR)
histograms = collections.get(collection.Key.SUMMARY_HISTOGRAM)
skip_optimize_ops = collections.get(
collection.Key.SKIP_OPTIMIZE)
skip_opt = set()
if skip_optimize_ops is not None:
skip_opt |= set(skip_optimize_ops)
if scalars is not None:
skip_opt |= {t for _, t in scalars}
if histograms is not None:
skip_opt |= {t for _, t in histograms}
skip_opt = list(skip_opt)
log.info(
'Train with: \n> Run_config: %s\n> Params: %s\n> Train_model_spec: %s\n'
% (repr(self.run_config), repr(self.params), repr(model_spec)))
summary_record = SummaryRecord(
scalar=collections.get(collection.Key.SUMMARY_SCALAR),
histogram=collections.get(collection.Key.SUMMARY_HISTOGRAM), )
return ProgramPair(
train_program=train_program,
startup_program=startup_prog), model_spec, summary_record
def _build_for_eval(self, ds):
ds.name = 'eval'
program = F.Program()
startup_prog = F.Program()
with F.program_guard(program, startup_prog):
#share var with Train net
with F.unique_name.guard():
log.info('Building Eval Graph')
fea = ds.features()
model_spec = _build_net(self.model_fn, fea, RunMode.EVAL,
self.params, self.run_config)
log.info('Done')
program = program.clone(for_test=True)
log.info(
'Eval with: \n> Run_config: %s\n> Params: %s\n> Eval_model_spec: %s\n'
% (repr(self.run_config), repr(self.params), repr(model_spec)))
return ProgramPair(
train_program=program, startup_program=startup_prog), model_spec
def _build_for_predict(self, ds):
ds.name = 'predict'
program = F.Program()
startup_prog = F.Program()
with F.program_guard(program, startup_prog):
#share var with Train net
with F.unique_name.guard():
log.info('Building Predict Graph')
fea = ds.features()
model_spec = _build_net(self.model_fn, fea, RunMode.PREDICT,
self.params, self.run_config)
log.info('Done')
program = program.clone(for_test=True)
log.info(
'Predict with: \n> Run_config: %s\n> Params: %s\n> Predict_model_spec: %s\n'
% (repr(self.run_config), repr(self.params), repr(model_spec)))
return ProgramPair(
train_program=program, startup_program=startup_prog), model_spec
def train(self, train_ds, train_hooks=[]):
"""train on a `Dataset`"""
if not isinstance(train_ds, Dataset):
raise ValueError('expect dataset to be instance of Dataset, got %s'
% repr(train_ds))
train_program, model_spec, summary_record = self._build_for_train(
train_ds)
train_run_hooks = [
hooks.StopAtStepHook(self.run_config.max_steps,
self.run_config.run_steps),
hooks.LoggingHook(
model_spec.loss,
summary_record=summary_record,
summary_writer=_get_summary_writer(
os.path.join(self.run_config.model_dir, 'train_history')),
per_step=self.run_config.log_steps,
skip_step=self.run_config.skip_steps),
]
if model_spec.train_hooks is not None:
train_run_hooks.extend(model_spec.train_hooks)
train_run_hooks.extend(train_hooks)
train_executor = F.Executor(_get_one_place())
mon_exe = MonitoredExecutor(
train_executor,
train_program,
loss=model_spec.loss,
run_config=self.run_config,
run_hooks=train_run_hooks,
warm_start_setting=self.warm_start_setting)
distribution.init_distribuition_env(
    train_program)  # initialize the distributed-training environment
mon_exe.init_or_restore_variables()
if distribution.status.is_master:
mon_exe._hooks.append(
hooks.CheckpointSaverHook(
mon_exe._saver,
per_step=mon_exe._save_steps,
skip_step=mon_exe._skip_steps))
try:
with mon_exe:
for data in train_ds.start():
mon_exe.run(feed=data)
except (StopException, F.core.EOFException) as e:
pass
return mon_exe.result
def evaluate(self, eval_dataset, eval_hooks=[]):
"""eval on a `Dataset`"""
if not isinstance(eval_dataset, Dataset):
raise ValueError('expect dataset to be instance of Dataset, got %s'
% repr(eval_dataset))
program, model_spec = self._build_for_eval(eval_dataset)
single_card_place = _get_one_place()
eval_executor = F.Executor(single_card_place)
eval_run_hooks = [
hooks.StopAtStepHook(self.run_config.eval_max_steps,
self.run_config.eval_max_steps),
hooks.EvalHook(model_spec.metrics, )
]
if model_spec.eval_hooks is not None:
eval_run_hooks.extend(model_spec.eval_hooks)
eval_run_hooks.extend(eval_hooks)
mon_exe = MonitoredExecutor(
eval_executor,
program,
run_config=self.run_config,
run_hooks=eval_run_hooks)
mon_exe.init_or_restore_variables()
try:
with mon_exe:
for data in eval_dataset.start(places=[single_card_place]):
mon_exe.run(feed=data)
except (StopException, F.core.EOFException) as e:
pass
_, eval_result = mon_exe.result
summary_writer = _get_summary_writer(
os.path.join(self.run_config.model_dir, 'eval_history'))
_log_eval_result('eval', eval_result, summary_writer, mon_exe.state)
return mon_exe.result
def predict(self,
predict_dataset,
ckpt=-1,
ckpt_path=None,
steps=-1,
split_batch=True):
"""
Perform predictoin
will call `model_fn` and initiate user-specifed model in `propeller.RunMode.PREDICT` mode
Args:
infer_dataset (propeller.data.Dataset): should not `shuffle` or `repeat`
steps (int): steps to predict, if None is specifed,
will stop when `StopException` is raised in `infer_dataset`
ckpt_path (None|str): Path of a specific checkpoint to predict.
If None, the latest checkpoint in model_dir is used.
If there are no checkpoints in model_dir,
prediction is run with newly initialized Variables instead of ones restored from checkpoint.
ckpt (int): deprecated args
split_batch (bool): if True, prediction of each example in a batch is returned.
Yields:
Evaluated values of predictions tensors.
"""
if not isinstance(predict_dataset, Dataset):
raise ValueError('expect dataset to be instance of Dataset, got %s'
% repr(predict_dataset))
program, model_spec = self._build_for_predict(predict_dataset)
single_card_place = _get_one_place()
executor = F.Executor(single_card_place)
pred_run_config = RunConfig(
run_steps=steps if steps != -1 else None,
model_dir=self.run_config.model_dir)
mon_exe = MonitoredExecutor(
executor,
program,
run_config=pred_run_config, )
mon_exe.init_or_restore_variables(ckpt
if ckpt_path is None else ckpt_path)
try:
with mon_exe:
log.info('Running predict from dir: %s' % repr(mon_exe.state))
single_card_place = _get_one_place()
for data in predict_dataset.start(places=[single_card_place]):
res = mon_exe.run(fetch_list=model_spec.predictions,
feed=data)
if split_batch:
res = map(lambda i: i.tolist(), res)
res = zip(*res) # transpose
for r in res:
yield r
else:
yield list(map(lambda i: i.tolist(), res))
except (StopException, F.core.EOFException) as e:
pass
def train_and_eval(_placeholder=None,
                   model_class_or_model_fn=None,
                   params=None,
                   run_config=None,
                   train_dataset=None,
                   eval_dataset=None,
                   warm_start_setting=None,
                   train_hooks=None,
                   eval_hooks=None,
                   exporters=None):
"""
Perform train and evaluate procesure.
will call `model_fn` and initiate user-specifed model in `propeller.RunMode.PREDICT` mode
Args:
model_class_or_model_fn(callable|propeller.train.Model): `model_class_or_model_fn` be specified in 2 ways:
1. subclass of propeller.train.Model
2. a model_fn takes following args: 1. features; 2. param; 3. mode; 4. run_config(optional)
and returns a `propeller.ModelSpec`
params: any python object, will pass to your `model_fn` or `propeller.train.Model`
run_config (propeller.RunConfig): run_config.max_steps should not be None.
train_dataset (propeller.paddle.data.Dataset): training will stop if global_step > run_config.max_steps.
eval_dataset (propeller.paddle.data.Dataset|dict): Optional, if Dict of propeller.data.Dataset were specified,
will perform evluatation on every evaluation sets and report results.
warm_start_setting (propeller.WarmStartSetting): Optional. warm start variable will overwrite model variable.
train_hooks (list of propeller.paddle.train.RunHook): Optional.
eval_hooks (list of propeller.paddle.train.RunHook): Optional.
exporters (list of propeller.paddle.train.Exporter): Optional.
"""
if _placeholder is not None:
    raise ValueError('specify keyword args to this function')
if model_class_or_model_fn is None or params is None or run_config is None or train_dataset is None:
    raise ValueError(
        'some argument is None: model_class_or_model_fn:%s params:%s run_config:%s train_dataset:%s'
        % (model_class_or_model_fn, params, run_config, train_dataset))
# copy the hook lists so the callers' (or the default) lists are never mutated
train_hooks = list(train_hooks) if train_hooks is not None else []
eval_hooks = list(eval_hooks) if eval_hooks is not None else []
exporters = list(exporters) if exporters is not None else []
# init distribution env if the environment variable PROPELLER_DISCONFIG is set
if eval_dataset is None:
    raise ValueError('eval dataset not specified')
if not isinstance(eval_dataset, (dict, Dataset)):
    raise ValueError(
        'Eval dataset should be a propeller.Dataset or a dict of them, got: %s'
        % eval_dataset)
if isinstance(eval_dataset, Dataset):
eval_dataset = {'eval': eval_dataset}
ds_list = list(eval_dataset.values())
for ds in ds_list:
ds.name = 'eval'
first = ds_list[0]
for d in ds_list[1:]:
if not first.__eq__(d):
raise ValueError(
'eval dataset has different output_shapes or types: %s' %
repr(ds_list))
est = Learner(
model_class_or_model_fn,
run_config,
params,
warm_start_setting=warm_start_setting)
class _EvalHookOnTrainLoop(hooks.RunHook):
def __init__(self):
self.program, self.model_spec = est._build_for_eval(
list(eval_dataset.values())[
0]) #eval_datasets must have same output shapes
self.summary_writers = {
ds_name: _get_summary_writer(
os.path.join(
os.path.join(run_config.model_dir, 'eval_history'),
ds_name))
for ds_name in eval_dataset
}
def after_run(self, _, state):
"""doc"""
if state.step > run_config.skip_steps and state.gstep % run_config.eval_steps == 0:
eval_results = {}
for name, ds in six.iteritems(eval_dataset):
ehooks = [
hooks.StopAtStepHook(est.run_config.eval_max_steps,
est.run_config.eval_max_steps),
hooks.EvalHook(
self.model_spec.metrics,
summary_writer=self.summary_writers[name], )
]
single_card_place = _get_one_place()
eval_executor = F.Executor(single_card_place)
mon_exe = MonitoredExecutor(
eval_executor,
self.program,
run_config=est.run_config,
run_hooks=ehooks + eval_hooks)
try:
with mon_exe:
for data in ds.start(places=[single_card_place]):
mon_exe.run(feed=data)
except (StopException, F.core.EOFException) as e:
pass
hook_results = mon_exe.result
eval_res = hook_results[
1] # hook_results: [StopAtStepHook, EvalHook, ...]
eval_results[name] = eval_res
_log_eval_result(name, eval_res,
self.summary_writers[name], state)
for exporter in exporters:
exporter.export(eval_executor, self.program,
self.model_spec, eval_results, state)
else:
eval_results = {}
return eval_results
if distribution.status.is_master:
train_hooks.append(_EvalHookOnTrainLoop())
res = est.train(train_dataset, train_hooks=train_hooks)
return res
def _build_model_fn(model_class):
def _model_fn(features, mode, params, run_config):
if mode != RunMode.PREDICT:
fea, label = features[:-1], features[-1]
else:
fea = features
model = model_class(params, mode, run_config=run_config)
pred = model.forward(fea)
if isinstance(pred, F.framework.Variable):
prediction = [pred]
else:
prediction = pred
if mode == RunMode.TRAIN:
loss = model.loss(pred, label)
model.backward(loss)
return ModelSpec(loss=loss, predictions=prediction, mode=mode)
elif mode == RunMode.EVAL:
loss = model.loss(pred, label)
me = model.metrics(pred, label)
inf_spec = InferenceSpec(inputs=fea, outputs=prediction)
if 'loss' not in me:
me['loss'] = metrics.Mean(loss)
return ModelSpec(
loss=loss,
predictions=prediction,
metrics=me,
mode=mode,
inference_spec=inf_spec)
elif mode == RunMode.PREDICT:
inf_spec = InferenceSpec(inputs=fea, outputs=prediction)
return ModelSpec(
predictions=prediction, mode=mode, inference_spec=inf_spec)
else:
raise RuntimeError('unknown run mode %s' % mode)
return _model_fn
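# --------------------------------------------------------------------------
# A hedged usage sketch of the `Learner` API above, not part of the library.
# `make_dataset` is a hypothetical helper standing in for however you build a
# propeller Dataset; the model_fn below follows the checks in `_build_net`
# (a TRAIN-mode ModelSpec needs a scalar loss and a predictions list).
#
# def model_fn(features, mode, params, run_config):
#     fea, label = features[:-1], features[-1]
#     logits = ...  # build your paddle.fluid network on `fea`
#     loss = L.reduce_mean(L.softmax_with_cross_entropy(logits, label))
#     if mode == RunMode.TRAIN:
#         F.optimizer.Adam(params['lr']).minimize(loss)
#     return ModelSpec(loss=loss, predictions=[logits], mode=mode)
#
# learner = Learner(model_fn,
#                   RunConfig(model_dir='./output', max_steps=10000,
#                             save_steps=1000, log_steps=10),
#                   params={'lr': 1e-4})
# learner.train(make_dataset('train'))
# --------------------------------------------------------------------------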
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""server"""
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import asyncio
import threading
import math
import zmq
import zmq.asyncio
import numpy as np
from propeller import log
import propeller.service.utils as serv_utils
class InferenceBaseClient(object):
def __init__(self, address):
self.context = zmq.Context()
self.address = address
self.socket = self.context.socket(zmq.REQ)
self.socket.connect(address)
log.info("Connecting to server... %s" % address)
def __call__(self, *args):
for arg in args:
if not isinstance(arg, np.ndarray):
raise ValueError('expect ndarray slot data, got %s' %
repr(arg))
request = serv_utils.nparray_list_serialize(args)
self.socket.send(request)
reply = self.socket.recv()
ret = serv_utils.nparray_list_deserialize(reply)
return ret
class InferenceClient(InferenceBaseClient):
def __init__(self, address, batch_size=128, num_coroutine=10, timeout=10.):
self.loop = asyncio.new_event_loop()
asyncio.set_event_loop(self.loop)
context = zmq.asyncio.Context()
self.socket_pool = [
context.socket(zmq.REQ) for _ in range(num_coroutine)
]
log.info("Connecting to server... %s" % address)
for socket in self.socket_pool:
socket.connect(address)
self.num_coroutine = num_coroutine
self.batch_size = batch_size
self.timeout = int(timeout * 1000)
#yapf: disable
def __call__(self, *args):
for arg in args:
if not isinstance(arg, np.ndarray):
raise ValueError('expect ndarray slot data, got %s' %
repr(arg))
num_tasks = math.ceil(1. * args[0].shape[0] / self.batch_size)
rets = [None] * num_tasks
async def get(coroutine_idx=0, num_coroutine=1):
socket = self.socket_pool[coroutine_idx]
while coroutine_idx < num_tasks:
begin = coroutine_idx * self.batch_size
end = (coroutine_idx + 1) * self.batch_size
arr_list = [arg[begin:end] for arg in args]
request = serv_utils.nparray_list_serialize(arr_list)
try:
await socket.send(request)
await socket.poll(self.timeout, zmq.POLLIN)
reply = await socket.recv(zmq.NOBLOCK)
ret = serv_utils.nparray_list_deserialize(reply)
except Exception as e:
log.exception(e)
ret = None
rets[coroutine_idx] = ret
coroutine_idx += num_coroutine
futures = [
get(i, self.num_coroutine) for i in range(self.num_coroutine)
]
self.loop.run_until_complete(asyncio.wait(futures))
for r in rets:
if r is None:
raise RuntimeError('Client call failed')
return [np.concatenate(col, 0) for col in zip(*rets)]
#yapf: enable
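# --------------------------------------------------------------------------
# A hedged usage sketch (not part of the module): the clients above take
# numpy arrays, serialize them into Slots over a zmq REQ socket, and return
# the reply as numpy arrays. Address and shapes below are placeholders;
# the number of outputs depends on the served model.
#
# import numpy as np
# client = InferenceClient('tcp://localhost:8888',
#                          batch_size=32, num_coroutine=4)
# ids = np.zeros([2, 128], dtype=np.int64)     # e.g. token ids
# mask = np.ones([2, 128], dtype=np.float32)
# outputs = client(ids, mask)  # list of np.ndarray, batch order preserved
# --------------------------------------------------------------------------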
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package interface;
service Inference {
rpc Infer(Slots) returns (Slots){}
}
message Slots {
repeated Slot slots = 1;
}
message Slot {
enum Type {
// Pod Types
BOOL = 0;
INT16 = 1;
INT32 = 2;
INT64 = 3;
FP16 = 4;
FP32 = 5;
FP64 = 6;
// Tensor<size_t> is used in C++.
SIZE_T = 19;
UINT8 = 20;
INT8 = 21;
}
Type type = 1;
repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
bytes data = 3;
}
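# A small illustration (hedged, not generated code) of the Slot encoding
# defined in the proto above, using the generated `interface_pb2` bindings
# that follow: a float32 tensor whose leading dimension is dynamic stores
# dims as [-1, 640, 480] and the raw buffer in `data`.
#
# import numpy as np
# from propeller.service import interface_pb2
# arr = np.zeros([3, 640, 480], dtype=np.float32)
# slot = interface_pb2.Slot(type=interface_pb2.Slot.FP32,
#                           dims=[-1, 640, 480], data=arr.tobytes())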
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: interface.proto
import sys
_b = sys.version_info[0] < 3 and (lambda x: x) or (
lambda x: x.encode('latin1'))
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor.FileDescriptor(
name='interface.proto',
package='interface',
syntax='proto3',
serialized_options=None,
serialized_pb=_b(
'\n\x0finterface.proto\x12\tinterface\"\'\n\x05Slots\x12\x1e\n\x05slots\x18\x01 \x03(\x0b\x32\x0f.interface.Slot\"\xb8\x01\n\x04Slot\x12\"\n\x04type\x18\x01 \x01(\x0e\x32\x14.interface.Slot.Type\x12\x0c\n\x04\x64ims\x18\x02 \x03(\x03\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"p\n\x04Type\x12\x08\n\x04\x42OOL\x10\x00\x12\t\n\x05INT16\x10\x01\x12\t\n\x05INT32\x10\x02\x12\t\n\x05INT64\x10\x03\x12\x08\n\x04\x46P16\x10\x04\x12\x08\n\x04\x46P32\x10\x05\x12\x08\n\x04\x46P64\x10\x06\x12\n\n\x06SIZE_T\x10\x13\x12\t\n\x05UINT8\x10\x14\x12\x08\n\x04INT8\x10\x15\x32:\n\tInference\x12-\n\x05Infer\x12\x10.interface.Slots\x1a\x10.interface.Slots\"\x00\x62\x06proto3'
))
_SLOT_TYPE = _descriptor.EnumDescriptor(
name='Type',
full_name='interface.Slot.Type',
filename=None,
file=DESCRIPTOR,
values=[
_descriptor.EnumValueDescriptor(
name='BOOL', index=0, number=0, serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='INT16',
index=1,
number=1,
serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='INT32',
index=2,
number=2,
serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='INT64',
index=3,
number=3,
serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='FP16', index=4, number=4, serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='FP32', index=5, number=5, serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='FP64', index=6, number=6, serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='SIZE_T',
index=7,
number=19,
serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='UINT8',
index=8,
number=20,
serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='INT8',
index=9,
number=21,
serialized_options=None,
type=None),
],
containing_type=None,
serialized_options=None,
serialized_start=144,
serialized_end=256, )
_sym_db.RegisterEnumDescriptor(_SLOT_TYPE)
_SLOTS = _descriptor.Descriptor(
name='Slots',
full_name='interface.Slots',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='slots',
full_name='interface.Slots.slots',
index=0,
number=1,
type=11,
cpp_type=10,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[],
serialized_start=30,
serialized_end=69, )
_SLOT = _descriptor.Descriptor(
name='Slot',
full_name='interface.Slot',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='type',
full_name='interface.Slot.type',
index=0,
number=1,
type=14,
cpp_type=8,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='dims',
full_name='interface.Slot.dims',
index=1,
number=2,
type=3,
cpp_type=2,
label=3,
has_default_value=False,
default_value=[],
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='data',
full_name='interface.Slot.data',
index=2,
number=3,
type=12,
cpp_type=9,
label=1,
has_default_value=False,
default_value=_b(""),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR),
],
extensions=[],
nested_types=[],
enum_types=[_SLOT_TYPE, ],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[],
serialized_start=72,
serialized_end=256, )
_SLOTS.fields_by_name['slots'].message_type = _SLOT
_SLOT.fields_by_name['type'].enum_type = _SLOT_TYPE
_SLOT_TYPE.containing_type = _SLOT
DESCRIPTOR.message_types_by_name['Slots'] = _SLOTS
DESCRIPTOR.message_types_by_name['Slot'] = _SLOT
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
Slots = _reflection.GeneratedProtocolMessageType(
'Slots',
(_message.Message, ),
{
'DESCRIPTOR': _SLOTS,
'__module__': 'interface_pb2'
# @@protoc_insertion_point(class_scope:interface.Slots)
})
_sym_db.RegisterMessage(Slots)
Slot = _reflection.GeneratedProtocolMessageType(
'Slot',
(_message.Message, ),
{
'DESCRIPTOR': _SLOT,
'__module__': 'interface_pb2'
# @@protoc_insertion_point(class_scope:interface.Slot)
})
_sym_db.RegisterMessage(Slot)
_INFERENCE = _descriptor.ServiceDescriptor(
name='Inference',
full_name='interface.Inference',
file=DESCRIPTOR,
index=0,
serialized_options=None,
serialized_start=258,
serialized_end=316,
methods=[
_descriptor.MethodDescriptor(
name='Infer',
full_name='interface.Inference.Infer',
index=0,
containing_service=None,
input_type=_SLOTS,
output_type=_SLOTS,
serialized_options=None, ),
])
_sym_db.RegisterServiceDescriptor(_INFERENCE)
DESCRIPTOR.services_by_name['Inference'] = _INFERENCE
# @@protoc_insertion_point(module_scope)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Never import paddle.fluid in the main process, nor any module that imports fluid.
"""
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import os
import logging
import six
from time import sleep, time
import multiprocessing
import zmq
log = logging.getLogger(__name__)
def _profile(msg):
def _decfn(fn):
def _retfn(*args, **kwargs):
start = time()
ret = fn(*args, **kwargs)
end = time()
log.debug('%s timecost: %.5f' % (msg, end - start))
return ret
return _retfn
return _decfn
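# `_profile` is a small timing decorator: it logs the wall-clock cost of each
# call at DEBUG level under the given tag. A hedged illustration:
#
# @_profile('sleepy')
# def _work():
#     sleep(0.1)
# _work()  # logs roughly: "sleepy timecost: 0.10012"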
class Predictor(object):
"""paddle predictor wrapper"""
def __init__(self, model_dir, device_idx=0):
import paddle.fluid as F
log.debug('create predictor on card %d' % device_idx)
config = F.core.AnalysisConfig(model_dir)
config.enable_use_gpu(5000, device_idx)
self._predictor = F.core.create_paddle_predictor(config)
@_profile('paddle')
def __call__(self, args):
for i, a in enumerate(args):
a.name = 'placeholder_%d' % i
res = self._predictor.run(args)
return res
def run_worker(model_dir, device_idx, endpoint="ipc://worker.ipc"):
"""worker process entrence"""
try:
log.debug("run_worker %s" % device_idx)
os.environ["CUDA_VISIBLE_DEVICES"] = os.getenv(
"CUDA_VISIBLE_DEVICES").split(",")[device_idx]
log.debug('cuda_env %s' % os.environ["CUDA_VISIBLE_DEVICES"])
import paddle.fluid as F
from propeller.service import interface_pb2
import propeller.service.utils as serv_utils
context = zmq.Context()
socket = context.socket(zmq.REP)
socket.connect(endpoint)
#socket.bind(endpoint)
log.debug("Predictor building %s" % device_idx)
predictor = Predictor(model_dir, 0)
log.debug("Predictor %s" % device_idx)
except Exception as e:
log.exception(e)
while True:
# Wait for next request from client
try:
message = socket.recv()
log.debug("get message %s" % device_idx)
slots = interface_pb2.Slots()
slots.ParseFromString(message)
pts = [serv_utils.slot_to_paddlearray(s) for s in slots.slots]
ret = predictor(pts)
slots = interface_pb2.Slots(
slots=[serv_utils.paddlearray_to_slot(r) for r in ret])
socket.send(slots.SerializeToString())
except Exception as e:
log.exception(e)
socket.send(repr(e).encode('utf-8'))  # report the error to the client
class InferencePredictor(object):
"""control Predictor for multi gpu card"""
def __init__(self, backend_addr, model_dir, n_devices=1):
self.backend_addr = backend_addr
self.model_dir = model_dir
self.n_devices = n_devices
self.children = []
def start(self):
"""doc"""
for device_idx in range(self.n_devices):
p = multiprocessing.Process(
target=run_worker,
args=(self.model_dir, device_idx, self.backend_addr))
p.start()
self.children.append(p)
return self
def join(self):
"""doc"""
for p in self.children:
p.join()
def term(self):
"""doc"""
for p in self.children:
log.debug("terminating children %s" % repr(p))
p.terminate()
class InferenceProxy(object):
"""zmq proxy"""
def __init__(self):
"""doc"""
self.backend = None
self.frontend = None
def listen(self, frontend_addr, backend_addr):
"""doc"""
log.info("InferenceProxy starting...")
try:
context = zmq.Context(1)
# Socket facing clients
self.frontend = context.socket(zmq.ROUTER)
self.frontend.bind(frontend_addr)
# Socket facing services
self.backend = context.socket(zmq.DEALER)
self.backend.bind(backend_addr)
log.info("Queue init done")
zmq.device(zmq.QUEUE, self.frontend, self.backend)
except Exception as e:
log.exception(e)
log.info("Bringing down zmq device")
finally:
log.debug('terminating proxy')
if self.frontend is not None:
self.frontend.close()
if self.backend is not None:
self.backend.close()
context.term()
class InferenceServer(object):
"""start InferencePredictor and InferenceProxy"""
def __init__(self, model_dir, n_devices):
"""doc"""
self.model_dir = model_dir
self.n_devices = n_devices
def listen(self, port):
"""doc"""
frontend_addr = "tcp://*:%s" % port
backend_addr = "ipc://backend.ipc"
predictor = InferencePredictor(backend_addr, self.model_dir,
self.n_devices).start()
try:
proxy = InferenceProxy()
proxy.listen(frontend_addr, backend_addr)
predictor.join()
except KeyboardInterrupt:
log.debug('terminating server')
predictor.term()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""utils for server"""
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
import struct
from propeller.service import interface_pb2
def slot_to_numpy(slot):
"""doc"""
if slot.type == interface_pb2.Slot.FP32:
dtype = np.float32
type_str = 'f'
elif slot.type == interface_pb2.Slot.INT32:
type_str = 'i'
dtype = np.int32
elif slot.type == interface_pb2.Slot.INT64:
dtype = np.int64
type_str = 'q'
else:
raise RuntimeError('unknown type %s' % slot.type)
num = len(slot.data) // struct.calcsize(type_str)
arr = struct.unpack('%d%s' % (num, type_str), slot.data)
shape = slot.dims
ret = np.array(arr, dtype=dtype).reshape(shape)
return ret
def numpy_to_slot(arr):
"""doc"""
if arr.dtype == np.float32:
dtype = interface_pb2.Slot.FP32
elif arr.dtype == np.int32:
dtype = interface_pb2.Slot.INT32
elif arr.dtype == np.int64:
dtype = interface_pb2.Slot.INT64
else:
raise RuntimeError('unknown type %s' % arr.dtype)
pb = interface_pb2.Slot(
type=dtype, dims=list(arr.shape), data=arr.tobytes())
return pb
def slot_to_paddlearray(slot):
"""doc"""
import paddle.fluid.core as core
if slot.type == interface_pb2.Slot.FP32:
dtype = np.float32
type_str = 'f'
elif slot.type == interface_pb2.Slot.INT32:
dtype = np.int32
type_str = 'i'
elif slot.type == interface_pb2.Slot.INT64:
dtype = np.int64
type_str = 'q'
else:
raise RuntimeError('unknown type %s' % slot.type)
num = len(slot.data) // struct.calcsize(type_str)
arr = struct.unpack('%d%s' % (num, type_str), slot.data)
ret = core.PaddleTensor(data=np.array(arr, dtype=dtype).reshape(slot.dims))
return ret
def paddlearray_to_slot(arr):
"""doc"""
import paddle.fluid.core as core
if arr.dtype == core.PaddleDType.FLOAT32:
dtype = interface_pb2.Slot.FP32
type_str = 'f'
arr_data = arr.data.float_data()
elif arr.dtype == core.PaddleDType.INT32:
dtype = interface_pb2.Slot.INT32
type_str = 'i'
arr_data = arr.data.int32_data()
elif arr.dtype == core.PaddleDType.INT64:
dtype = interface_pb2.Slot.INT64
type_str = 'q'
arr_data = arr.data.int64_data()
else:
raise RuntimeError('unknown type %s' % arr.dtype)
data = struct.pack('%d%s' % (len(arr_data), type_str), *arr_data)
pb = interface_pb2.Slot(type=dtype, dims=list(arr.shape), data=data)
return pb
def nparray_list_serialize(arr_list):
"""doc"""
slot_list = [numpy_to_slot(arr) for arr in arr_list]
slots = interface_pb2.Slots(slots=slot_list)
return slots.SerializeToString()
def nparray_list_deserialize(string):
"""doc"""
slots = interface_pb2.Slots()
slots.ParseFromString(string)
return [slot_to_numpy(slot) for slot in slots.slots]
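# Hedged self-check (not part of the original module): round-trips a list of
# numpy arrays through the Slot wire format defined above; runs only when
# this file is executed directly.
if __name__ == '__main__':
    arrs = [np.arange(6, dtype=np.int64).reshape(2, 3),
            np.ones([4], dtype=np.float32)]
    wire = nparray_list_serialize(arrs)
    back = nparray_list_deserialize(wire)
    assert all(np.array_equal(a, b) for a, b in zip(arrs, back))
    print('Slot round-trip ok:', [b.shape for b in back])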
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import os
import struct
import logging
import argparse
import numpy as np
import collections
from distutils import dir_util
import pickle
#from utils import print_arguments
import paddle.fluid as F
from paddle.fluid.proto import framework_pb2
log = logging.getLogger(__name__)
formatter = logging.Formatter(
fmt='[%(levelname)s] %(asctime)s [%(filename)12s:%(lineno)5d]:\t%(message)s'
)
console = logging.StreamHandler()
console.setFormatter(formatter)
log.addHandler(console)
log.setLevel(logging.DEBUG)
def gen_arr(data, dtype):
num = len(data) // struct.calcsize(dtype)
arr = struct.unpack('%d%s' % (num, dtype), data)
return arr
def parse(filename):
with open(filename, 'rb') as f:
read = lambda fmt: struct.unpack(fmt, f.read(struct.calcsize(fmt)))
_, = read('I') # version
lodsize, = read('Q')
if lodsize != 0:
log.warning('LoD tensor is not supported, skipped!')
return None
_, = read('I') # version
pbsize, = read('i')
data = f.read(pbsize)
proto = framework_pb2.VarType.TensorDesc()
proto.ParseFromString(data)
log.info('type: [%s] dim %s' % (proto.data_type, proto.dims))
if proto.data_type == framework_pb2.VarType.FP32:
arr = np.array(
gen_arr(f.read(), 'f'), dtype=np.float32).reshape(proto.dims)
elif proto.data_type == framework_pb2.VarType.INT64:
arr = np.array(
gen_arr(f.read(), 'q'), dtype=np.int64).reshape(proto.dims)
elif proto.data_type == framework_pb2.VarType.INT32:
arr = np.array(
gen_arr(f.read(), 'i'), dtype=np.int32).reshape(proto.dims)
elif proto.data_type == framework_pb2.VarType.INT8:
    arr = np.array(
        gen_arr(f.read(), 'b'), dtype=np.int8).reshape(proto.dims)
elif proto.data_type == framework_pb2.VarType.FP16:
arr = np.array(
gen_arr(f.read(), 'H'),
dtype=np.uint16).view(np.float16).reshape(proto.dims)
else:
raise RuntimeError('Unknown dtype %s' % proto.data_type)
return arr
def show(arr):
print(repr(arr))
def dump(arr, path):
    # strip the leading separator, otherwise os.path.join ignores args.to
    path = os.path.join(args.to, path.lstrip(os.sep))
log.info('dump to %s' % path)
try:
os.makedirs(os.path.dirname(path))
except FileExistsError:
pass
pickle.dump(arr, open(path, 'wb'), protocol=4)
def list_dir(dir_or_file):
if os.path.isfile(dir_or_file):
return [dir_or_file]
else:
return [
os.path.join(i, kk) for i, _, k in os.walk(dir_or_file) for kk in k
]
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('mode', choices=['show', 'dump'], type=str)
parser.add_argument('file_or_dir', type=str)
parser.add_argument('-t', "--to", type=str, default=None)
parser.add_argument('-v', "--verbose", action='store_true')
args = parser.parse_args()
files = list_dir(args.file_or_dir)
parsed_arr = map(parse, files)
if args.mode == 'show':
for arr in parsed_arr:
if arr is not None:
show(arr)
elif args.mode == 'dump':
if args.to is None:
raise ValueError('--to dir_name not specified')
for arr, path in zip(parsed_arr, files):
if arr is not None:
dump(arr, path.replace(args.file_or_dir, ''))
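# Hedged usage notes (the script name below is an assumption; use whatever
# this file is saved as):
#
#   python saved_params.py show ./saved_model/word_embedding
#   python saved_params.py dump ./saved_model --to ./np_params
#
# `show` prints every tensor it can parse; `dump` pickles each tensor under
# --to, mirroring the layout of the source directory.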
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import sys
import os
import argparse
import logging
import logging.handlers
from propeller.service.server import InferenceServer
from propeller import log
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-m', '--model_dir', type=str, required=True)
parser.add_argument('-p', '--port', type=int, required=True)
parser.add_argument('-v', '--verbose', action='store_true')
args = parser.parse_args()
if args.verbose:
log.setLevel(logging.DEBUG)
n_devices = len(os.getenv("CUDA_VISIBLE_DEVICES").split(","))
server = InferenceServer(args.model_dir, n_devices)
log.info('propeller server listening on port %d' % args.port)
server.listen(args.port)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
doc
"""
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Model template
"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import sys
import six
import logging
import os
import itertools
import json
import abc
import numpy as np
@six.add_metaclass(abc.ABCMeta)
class Model(object):
"""
template
"""
def __init__(self, config, mode, run_config=None):
    """
    Args:
        config (dict): hyper param
        mode (propeller.RunMode): `TRAIN` and `EVAL` models will be created in propeller.train_and_eval
        run_config (propeller.RunConfig): Optional. passed through by the `Learner`
    """
    self.mode = mode
@abc.abstractmethod
def forward(self, features):
"""
Args:
features (list of Tensor): inputs features that depends on your Dataset.output_shapes
Returns:
return (Tensor): prediction
"""
pass
@abc.abstractmethod
def loss(self, predictions, label):
"""
Args:
predictions (Tensor): result of `self.forward`
label (Tensor): depends on your Dataset.output_shapes
Returns:
return (paddle scalar): loss
"""
pass
@abc.abstractmethod
def backward(self, loss):
"""
Call in TRAIN mode
Args:
loss (Tensor): result of `self.loss`
Returns:
None
"""
pass
@abc.abstractmethod
def metrics(self, predictions, label):
"""
Call in EVAL mode
Args:
predictions (Tensor): result of `self.forward`
label (Tensor): depends on your Dataset.output_shapes
Returns:
(dict): k-v map like: {"metrics_name": propeller.Metrics }
"""
return {}
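# --------------------------------------------------------------------------
# A hedged subclass sketch (not part of the template): a bag-of-words
# classifier, assuming one int64 text tensor plus an int64 label, and that
# `propeller.paddle.train.metrics` exposes an accuracy metric (`Acc` here is
# an assumption; substitute whatever your metrics module provides).
#
# import paddle.fluid as F
# import paddle.fluid.layers as L
# from propeller.paddle.train import metrics
#
# class BowModel(Model):
#     def __init__(self, config, mode, run_config=None):
#         super(BowModel, self).__init__(config, mode, run_config)
#         self.hidden = config['hidden_size']
#     def forward(self, features):
#         text, = features
#         emb = L.embedding(text, size=[30000, self.hidden])
#         return L.fc(L.reduce_mean(emb, dim=1), size=2)
#     def loss(self, predictions, label):
#         return L.reduce_mean(
#             L.softmax_with_cross_entropy(predictions, label))
#     def backward(self, loss):
#         F.optimizer.Adam(learning_rate=1e-4).minimize(loss)
#     def metrics(self, predictions, label):
#         return {'acc': metrics.Acc(label, L.argmax(predictions, axis=1))}
# --------------------------------------------------------------------------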
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Basic types"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import os
import json
from collections import namedtuple
class RunMode(object):
"""model_fn will be called in 3 modes"""
TRAIN = 1
PREDICT = 2
EVAL = 3
class HParams(object):
"""Hyper paramerter"""
def __init__(self, **kwargs):
for k, v in kwargs.items():
self.__dict__[k] = v
def __contains__(self, key):
return key in self.__dict__
def __getitem__(self, key):
if key not in self.__dict__:
raise ValueError('key(%s) not in HParams.' % key)
return self.__dict__[key]
def __repr__(self):
return repr(self.to_dict())
def __setitem__(self, key, val):
self.__dict__[key] = val
@classmethod
def from_json(cls, json_str):
"""doc"""
d = json.loads(json_str)
if type(d) != dict:
raise ValueError('json object must be dict.')
return HParams.from_dict(d)
def get(self, key, default=None):
"""doc"""
return self.__dict__.get(key, default)
@classmethod
def from_dict(cls, d):
"""doc"""
if type(d) != dict:
raise ValueError('input must be dict.')
hp = HParams(**d)
return hp
def to_json(self):
"""doc"""
return json.dumps(self.__dict__)
def to_dict(self):
"""doc"""
return self.__dict__
def join(self, other):
"""doc"""
if not isinstance(other, HParams):
raise ValueError('input must be HParams instance.')
self.__dict__.update(**other.__dict__)
return self
SummaryRecord = namedtuple('SummaryRecord', ['scalar', 'histogram'])
WarmStartSetting = namedtuple('WarmStartSetting', ['predicate_fn', 'from_dir'])
RunConfig = namedtuple('RunConfig', [
'model_dir', 'run_steps', 'max_steps', 'save_steps', 'eval_steps',
'eval_max_steps', 'skip_steps', 'log_steps', 'max_ckpt', 'shit'
])
RunConfig.__new__.__defaults__ = (None, ) * len(RunConfig._fields)
ProgramPair = namedtuple('ProgramPair', ['train_program', 'startup_program'])
InferenceSpec = namedtuple('InferenceSpec', ['inputs', 'outputs'])
ModelSpec = namedtuple('ModelSpec', [
'loss',
'predictions',
'metrics',
'mode',
'inference_spec',
'train_hooks',
'eval_hooks',
])
ModelSpec.__new__.__defaults__ = (None, ) * len(ModelSpec._fields)
class StopException(Exception):
"""doc"""
pass
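# Hedged self-check (not part of the original module) for the plain-data
# types above; field values are placeholders. Runs only when executed
# directly. RunConfig/ModelSpec default every unset field to None via
# __new__.__defaults__, and HParams round-trips through JSON.
if __name__ == '__main__':
    hp = HParams(hidden_size=768, lr=1e-4)
    hp = hp.join(HParams.from_json('{"dropout": 0.1}'))
    assert hp['dropout'] == 0.1 and hp.get('missing', 'x') == 'x'
    cfg = RunConfig(model_dir='./output', max_steps=100, log_steps=10)
    assert cfg.save_steps is None  # unset RunConfig fields default to None
    print(hp, cfg.model_dir)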
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""global utils"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import os
import six
import re
import json
import argparse
import itertools
import logging
from functools import reduce
from propeller.types import RunConfig
from propeller.types import HParams
log = logging.getLogger(__name__)
def ArgumentParser(name):
"""predefined argparser"""
parser = argparse.ArgumentParser('propeller model')
parser.add_argument('--run_config', type=str, default='')
parser.add_argument(
'--hparam', type=str, nargs='*', action='append', default=[['']])
return parser
def _get_dict_from_environ_or_json_or_file(args, env_name):
if args == '':
return None
if args is None:
s = os.environ.get(env_name)
else:
s = args
if os.path.exists(s):
s = open(s).read()
if isinstance(s, six.string_types):
try:
r = eval(s)
except SyntaxError as e:
raise ValueError('json parse error: %s \n>Got json: %s' %
(repr(e), s))
return r
else:
return s  # s is None here (environment variable not set)
def parse_file(filename):
"""useless api"""
d = _get_dict_from_environ_or_json_or_file(filename, None)
if d is None:
raise ValueError('file(%s) not found' % filename)
return d
def parse_runconfig(args=None):
"""get run_config from env or file"""
d = _get_dict_from_environ_or_json_or_file(args.run_config,
'PROPELLER_RUNCONFIG')
if d is None:
raise ValueError('run_config not found')
return RunConfig(**d)
def parse_hparam(args=None):
"""get hparam from env or file"""
if args is not None:
hparam_strs = reduce(list.__add__, args.hparam)
else:
hparam_strs = [None]
hparams = [
_get_dict_from_environ_or_json_or_file(hp, 'PROPELLER_HPARAMS')
for hp in hparam_strs
]
hparams = [HParams(**h) for h in hparams if h is not None]
if len(hparams) == 0:
raise ValueError('hparam not found')
hparam = reduce(lambda x, y: x.join(y), hparams)
return hparam
def flatten(s):
"""doc"""
assert is_structure(s)
schema = [len(ss) for ss in s]
flt = list(itertools.chain(*s))
return flt, schema
def unflatten(structure, schema):
"""doc"""
start = 0
res = []
for _range in schema:
res.append(structure[start:start + _range])
start += _range
return res
def is_structure(s):
    """doc"""
    return isinstance(s, (list, tuple))
def map_structure(func, s):
"""same sa tf.map_structure"""
if isinstance(s, list) or isinstance(s, tuple):
return [map_structure(func, ss) for ss in s]
elif isinstance(s, dict):
return {k: map_structure(func, v) for k, v in six.iteritems(s)}
else:
return func(s)
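# Hedged self-check (not part of the original module) for the structure
# helpers above; runs only when executed directly. flatten records the
# length of each sub-list so unflatten can rebuild it, and map_structure
# applies a function leaf-wise through lists/tuples/dicts.
if __name__ == '__main__':
    flt, schema = flatten([[1, 2], [3], [4, 5, 6]])
    assert flt == [1, 2, 3, 4, 5, 6] and schema == [2, 1, 3]
    assert unflatten(flt, schema) == [[1, 2], [3], [4, 5, 6]]
    assert map_structure(lambda x: x * 2, {'a': [1, 2]}) == {'a': [2, 4]}
    print('structure helpers ok')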
nltk==3.4
numpy==1.14.5
pyzmq==18.0.2
scikit-learn==0.20.3
scipy==1.2.1
six==1.11.0
sklearn==0.0
sentencepiece==0.1.8
paddlepaddle-gpu==1.6.3.post107
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd` R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
export FLAGS_sync_nccl_allreduce=1 export FLAGS_sync_nccl_allreduce=1
export FLAGS_eager_delete_tensor_gb=0.0 export FLAGS_eager_delete_tensor_gb=0.0
export PYTHONPATH=./ernie:${PYTHONPATH:-}
if [[ -f ./model_conf ]];then if [[ -f ./model_conf ]];then
source ./model_conf source ./model_conf
...@@ -20,7 +21,7 @@ batch_size=64 ...@@ -20,7 +21,7 @@ batch_size=64
epoch=3 epoch=3
for i in {1..5};do for i in {1..5};do
python -u run_classifier.py \ python -u ernie/run_classifier.py \
--use_cuda true \ --use_cuda true \
--for_cn False \ --for_cn False \
--use_fast_executor ${e_executor:-"true"} \ --use_fast_executor ${e_executor:-"true"} \
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd` R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
export FLAGS_eager_delete_tensor_gb=0.0 export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_sync_nccl_allreduce=1 export FLAGS_sync_nccl_allreduce=1
export PYTHONPATH=./ernie:${PYTHONPATH:-}
if [[ -f ./model_conf ]];then if [[ -f ./model_conf ]];then
source ./model_conf source ./model_conf
...@@ -20,7 +21,7 @@ for i in {1..5};do ...@@ -20,7 +21,7 @@ for i in {1..5};do
timestamp=`date "+%Y-%m-%d-%H-%M-%S"` timestamp=`date "+%Y-%m-%d-%H-%M-%S"`
python -u run_classifier.py \ python -u ./ernie/run_classifier.py \
--use_cuda true \ --use_cuda true \
--use_fast_executor ${e_executor:-"true"} \ --use_fast_executor ${e_executor:-"true"} \
--tokenizer ${TOKENIZER:-"FullTokenizer"} \ --tokenizer ${TOKENIZER:-"FullTokenizer"} \
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd` R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
export FLAGS_eager_delete_tensor_gb=0.0 export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_sync_nccl_allreduce=1 export FLAGS_sync_nccl_allreduce=1
export PYTHONPATH=./ernie:${PYTHONPATH:-}
if [[ -f ./model_conf ]];then if [[ -f ./model_conf ]];then
source ./model_conf source ./model_conf
...@@ -19,7 +20,7 @@ epoch=4 ...@@ -19,7 +20,7 @@ epoch=4
for i in {1..5};do for i in {1..5};do
timestamp=`date "+%Y-%m-%d-%H-%M-%S"` timestamp=`date "+%Y-%m-%d-%H-%M-%S"`
python -u run_classifier.py \ python -u ./ernie/run_classifier.py \
--use_cuda true \ --use_cuda true \
--for_cn False \ --for_cn False \
--use_fast_executor ${e_executor:-"true"} \ --use_fast_executor ${e_executor:-"true"} \
......
...@@ -4,6 +4,7 @@ R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd` ...@@ -4,6 +4,7 @@ R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
export FLAGS_eager_delete_tensor_gb=0.0 export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_sync_nccl_allreduce=1 export FLAGS_sync_nccl_allreduce=1
export PYTHONPATH=./ernie:${PYTHONPATH:-}
if [[ -f ./model_conf ]];then if [[ -f ./model_conf ]];then
source ./model_conf source ./model_conf
...@@ -22,7 +23,7 @@ for i in {1..5};do ...@@ -22,7 +23,7 @@ for i in {1..5};do
timestamp=`date "+%Y-%m-%d-%H-%M-%S"` timestamp=`date "+%Y-%m-%d-%H-%M-%S"`
python -u run_classifier.py \ python -u ./ernie/run_classifier.py \
--use_cuda true \ --use_cuda true \
--for_cn False \ --for_cn False \
--use_fast_executor ${e_executor:-"true"} \ --use_fast_executor ${e_executor:-"true"} \
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd` R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
export FLAGS_eager_delete_tensor_gb=0.0 export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_sync_nccl_allreduce=1 export FLAGS_sync_nccl_allreduce=1
export PYTHONPATH=./ernie:${PYTHONPATH:-}
if [[ -f ./model_conf ]];then if [[ -f ./model_conf ]];then
source ./model_conf source ./model_conf
...@@ -21,7 +22,7 @@ for i in {1..1};do ...@@ -21,7 +22,7 @@ for i in {1..1};do
timestamp=`date "+%Y-%m-%d-%H-%M-%S"` timestamp=`date "+%Y-%m-%d-%H-%M-%S"`
python -u run_classifier.py \ python -u ./ernie/run_classifier.py \
--for_cn False \ --for_cn False \
--ernie_config_path script/en_glue/ernie_base/ernie_config.json \ --ernie_config_path script/en_glue/ernie_base/ernie_config.json \
--validation_steps 1000000000000 \ --validation_steps 1000000000000 \
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd` R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
export FLAGS_eager_delete_tensor_gb=0.0 export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_sync_nccl_allreduce=1 export FLAGS_sync_nccl_allreduce=1
export PYTHONPATH=./ernie:${PYTHONPATH:-}
if [[ -f ./model_conf ]];then if [[ -f ./model_conf ]];then
source ./model_conf source ./model_conf
...@@ -16,7 +17,7 @@ for i in {1..5};do ...@@ -16,7 +17,7 @@ for i in {1..5};do
timestamp=`date "+%Y-%m-%d-%H-%M-%S"` timestamp=`date "+%Y-%m-%d-%H-%M-%S"`
python -u run_classifier.py \ python -u ./ernie/run_classifier.py \
--use_cuda true \ --use_cuda true \
--for_cn False \ --for_cn False \
--use_fast_executor ${e_executor:-"true"} \ --use_fast_executor ${e_executor:-"true"} \
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd` R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
export FLAGS_eager_delete_tensor_gb=0.0 export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_sync_nccl_allreduce=1 export FLAGS_sync_nccl_allreduce=1
export PYTHONPATH=./ernie:${PYTHONPATH:-}
if [[ -f ./model_conf ]];then if [[ -f ./model_conf ]];then
source ./model_conf source ./model_conf
...@@ -18,7 +19,7 @@ epoch=4 ...@@ -18,7 +19,7 @@ epoch=4
for i in {1..5};do for i in {1..5};do
python -u run_classifier.py \ python -u ./ernie/run_classifier.py \
--for_cn False \ --for_cn False \
--use_cuda true \ --use_cuda true \
--use_fast_executor ${e_executor:-"true"} \ --use_fast_executor ${e_executor:-"true"} \
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd` R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
export FLAGS_eager_delete_tensor_gb=0.0 export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_sync_nccl_allreduce=1 export FLAGS_sync_nccl_allreduce=1
export PYTHONPATH=./ernie:${PYTHONPATH:-}
if [[ -f ./model_conf ]];then if [[ -f ./model_conf ]];then
source ./model_conf source ./model_conf
...@@ -18,7 +19,7 @@ epoch=3 ...@@ -18,7 +19,7 @@ epoch=3
for i in {1..5};do for i in {1..5};do
python -u run_classifier.py \ python -u ./ernie/run_classifier.py \
--use_cuda true \ --use_cuda true \
--for_cn False \ --for_cn False \
--use_fast_executor ${e_executor:-"true"} \ --use_fast_executor ${e_executor:-"true"} \
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd` R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
export FLAGS_eager_delete_tensor_gb=0.0 export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_sync_nccl_allreduce=1 export FLAGS_sync_nccl_allreduce=1
export PYTHONPATH=./ernie:${PYTHONPATH:-}
if [[ -f ./model_conf ]];then if [[ -f ./model_conf ]];then
source ./model_conf source ./model_conf
...@@ -20,7 +21,7 @@ epoch=4 ...@@ -20,7 +21,7 @@ epoch=4
for i in {1..5};do for i in {1..5};do
python -u run_classifier.py \ python -u ./ernie/run_classifier.py \
--for_cn False \ --for_cn False \
--use_cuda true \ --use_cuda true \
--use_fast_executor ${e_executor:-"true"} \ --use_fast_executor ${e_executor:-"true"} \
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd` R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
export FLAGS_sync_nccl_allreduce=1 export FLAGS_sync_nccl_allreduce=1
export FLAGS_eager_delete_tensor_gb=0.0 export FLAGS_eager_delete_tensor_gb=0.0
export PYTHONPATH=./ernie:${PYTHONPATH:-}
if [[ -f ./model_conf ]];then if [[ -f ./model_conf ]];then
source ./model_conf source ./model_conf
...@@ -19,7 +20,7 @@ epoch=5 ...@@ -19,7 +20,7 @@ epoch=5
for i in {1..5};do for i in {1..5};do
timestamp=`date "+%Y-%m-%d-%H-%M-%S"` timestamp=`date "+%Y-%m-%d-%H-%M-%S"`
python -u run_classifier.py \ python -u ./ernie/run_classifier.py \
--use_cuda true \ --use_cuda true \
--for_cn False \ --for_cn False \
--use_fast_executor ${e_executor:-"true"} \ --use_fast_executor ${e_executor:-"true"} \
......
...@@ -4,6 +4,7 @@ R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd` ...@@ -4,6 +4,7 @@ R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
export FLAGS_eager_delete_tensor_gb=0.0 export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_sync_nccl_allreduce=1 export FLAGS_sync_nccl_allreduce=1
export PYTHONPATH=./ernie:${PYTHONPATH:-}
if [[ -f ./model_conf ]];then if [[ -f ./model_conf ]];then
source ./model_conf source ./model_conf
...@@ -17,7 +18,7 @@ for i in {1..5};do ...@@ -17,7 +18,7 @@ for i in {1..5};do
timestamp=`date "+%Y-%m-%d-%H-%M-%S"` timestamp=`date "+%Y-%m-%d-%H-%M-%S"`
python -u run_classifier.py \ python -u ./ernie/run_classifier.py \
--use_cuda true \ --use_cuda true \
--use_fast_executor ${e_executor:-"true"} \ --use_fast_executor ${e_executor:-"true"} \
--tokenizer ${TOKENIZER:-"FullTokenizer"} \ --tokenizer ${TOKENIZER:-"FullTokenizer"} \
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd` R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
export FLAGS_eager_delete_tensor_gb=0.0 export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_sync_nccl_allreduce=1 export FLAGS_sync_nccl_allreduce=1
export PYTHONPATH=./ernie:${PYTHONPATH:-}
if [[ -f ./model_conf ]];then if [[ -f ./model_conf ]];then
source ./model_conf source ./model_conf
...@@ -20,7 +21,7 @@ epoch=4 ...@@ -20,7 +21,7 @@ epoch=4
for i in {1..5};do for i in {1..5};do
timestamp=`date "+%Y-%m-%d-%H-%M-%S"` timestamp=`date "+%Y-%m-%d-%H-%M-%S"`
python -u run_classifier.py \ python -u ./ernie/run_classifier.py \
--use_cuda true \ --use_cuda true \
--for_cn False \ --for_cn False \
--use_fast_executor ${e_executor:-"true"} \ --use_fast_executor ${e_executor:-"true"} \
......
@@ -4,6 +4,7 @@ R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
 if [[ -f ./model_conf ]];then
 source ./model_conf
@@ -21,7 +22,7 @@ for i in {1..5};do
 timestamp=`date "+%Y-%m-%d-%H-%M-%S"`
-python -u run_classifier.py \
+python -u ./ernie/run_classifier.py \
 --use_cuda true \
 --for_cn False \
 --use_fast_executor ${e_executor:-"true"} \
...
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
 if [[ -f ./model_conf ]];then
 source ./model_conf
@@ -16,7 +17,7 @@ for i in {1..5};do
 timestamp=`date "+%Y-%m-%d-%H-%M-%S"`
-python -u run_classifier.py \
+python -u ./ernie/run_classifier.py \
 --for_cn False \
 --ernie_config_path script/en_glue/ernie_large/ernie_config.json \
 --validation_steps 1000000000000 \
...
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
 if [[ -f ./model_conf ]];then
 source ./model_conf
@@ -16,7 +17,7 @@ mkdir -p log/
 for i in {1..5};do
 timestamp=`date "+%Y-%m-%d-%H-%M-%S"`
-python -u run_classifier.py \
+python -u ./ernie/run_classifier.py \
 --use_cuda true \
 --for_cn False \
 --use_fast_executor ${e_executor:-"true"} \
...
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
 mkdir -p log/
@@ -19,7 +20,7 @@ epoch=4
 for i in {1..5};do
-python -u run_classifier.py \
+python -u ./ernie/run_classifier.py \
 --for_cn False \
 --use_cuda true \
 --use_fast_executor ${e_executor:-"true"} \
...
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
 if [[ -f ./model_conf ]];then
 source ./model_conf
@@ -15,7 +16,7 @@ mkdir -p log/
 for i in {1..5};do
-python -u run_classifier.py \
+python -u ./ernie/run_classifier.py \
 --use_cuda true \
 --for_cn False \
 --use_fast_executor ${e_executor:-"true"} \
...
@@ -3,6 +3,7 @@
 R_DIR=`dirname $0`; MYDIR=`cd $R_DIR;pwd`
 export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
 if [[ -f ./model_conf ]];then
 source ./model_conf
@@ -19,7 +20,7 @@ epoch=4
 for i in {1..5};do
-python -u run_classifier.py \
+python -u ./ernie/run_classifier.py \
 --for_cn False \
 --use_cuda true \
 --use_fast_executor ${e_executor:-"true"} \
...
@@ -4,7 +4,8 @@ export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0
-python -u run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_classifier.py \
 --use_cuda true \
 --verbose true \
 --do_train true \
...
@@ -4,7 +4,8 @@ export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0
-python -u ./run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_classifier.py \
 --use_cuda true \
 --verbose true \
 --do_train true \
...
@@ -2,9 +2,15 @@ set -eux
 export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-python -u run_mrc.py --use_cuda true\
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python ./ernie/finetune_launch.py \
+--nproc_per_node 8 \
+--selected_gpus 0,1,2,3,4,5,6,7 \
+--node_ips $(hostname -i) \
+--node_id 0 \
+./ernie/run_mrc.py --use_cuda true\
 --batch_size 16 \
 --in_tokens false\
 --use_fast_executor true \
...
@@ -4,7 +4,13 @@ export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-python -u run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python ./ernie/finetune_launch.py \
+--nproc_per_node 8 \
+--selected_gpus 0,1,2,3,4,5,6,7 \
+--node_ips $(hostname -i) \
+--node_id 0 \
+./ernie/run_classifier.py \
 --use_cuda true \
 --verbose true \
 --do_train true \
...
@@ -2,9 +2,15 @@ set -eux
 export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-python -u run_mrc.py --use_cuda true\
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python ./ernie/finetune_launch.py \
+--nproc_per_node 8 \
+--selected_gpus 0,1,2,3,4,5,6,7 \
+--node_ips $(hostname -i) \
+--node_id 0 \
+./ernie/run_mrc.py --use_cuda true\
 --batch_size 16 \
 --in_tokens false\
 --use_fast_executor true \
...
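The recurring edit in the hunks above swaps a single-process `python -u run_*.py` call for `finetune_launch.py`, which starts one worker process per GPU. The launcher's source is not part of this view, so the following is only a hedged sketch of the general pattern such a wrapper implements; the environment variable names (`FLAGS_selected_gpus`, `PADDLE_TRAINER_ID`, `PADDLE_TRAINERS_NUM`) follow Paddle's usual distributed-training convention and are assumptions here, not code from this commit.

```python
# Hypothetical per-GPU launcher sketch (not the actual finetune_launch.py).
import os
import subprocess
import sys


def launch(script, script_args, nproc_per_node=8,
           selected_gpus="0,1,2,3,4,5,6,7"):
    gpus = selected_gpus.split(",")
    procs = []
    for rank in range(nproc_per_node):
        env = dict(os.environ)
        # Assumed Paddle distributed convention: each worker learns its
        # device and rank from environment variables.
        env["FLAGS_selected_gpus"] = gpus[rank]
        env["PADDLE_TRAINER_ID"] = str(rank)
        env["PADDLE_TRAINERS_NUM"] = str(nproc_per_node)
        procs.append(subprocess.Popen(
            [sys.executable, "-u", script] + script_args, env=env))
    # Propagate the worst exit code so the shell script's `set -eux` aborts.
    return max(p.wait() for p in procs)


if __name__ == "__main__":
    sys.exit(launch(sys.argv[1], sys.argv[2:]))
```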
@@ -4,7 +4,8 @@ export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0
-python -u run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_classifier.py \
 --use_cuda true \
 --verbose true \
 --do_train true \
...
@@ -4,7 +4,8 @@ export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0
-python -u run_sequence_labeling.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_sequence_labeling.py \
 --use_cuda true \
 --do_train true \
 --do_val true \
@@ -15,7 +16,7 @@ python -u run_sequence_labeling.py \
 --chunk_scheme "IOB" \
 --label_map_config ${TASK_DATA_PATH}/msra_ner/label_map.json \
 --train_set ${TASK_DATA_PATH}/msra_ner/train.tsv \
---dev_set ${TASK_DATA_PATH}/msra_ner/dev.tsv \
+--dev_set ${TASK_DATA_PATH}/msra_ner/dev.tsv,${TASK_DATA_PATH}/msra_ner/test.tsv \
 --test_set ${TASK_DATA_PATH}/msra_ner/test.tsv \
 --vocab_path ${MODEL_PATH}/vocab.txt \
 --ernie_config_path ${MODEL_PATH}/ernie_config.json \
@@ -24,6 +25,7 @@ python -u run_sequence_labeling.py \
 --weight_decay 0.01 \
 --warmup_proportion 0.0 \
 --validation_steps 100 \
+--use_fp16 false \
 --epoch 6 \
 --max_seq_len 256 \
 --learning_rate 5e-5 \
...
@@ -4,7 +4,8 @@ export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-python -u run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_classifier.py \
 --use_cuda true \
 --do_train true \
 --do_val true \
...
@@ -4,25 +4,32 @@ export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-python -u run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python ./ernie/finetune_launch.py \
+--nproc_per_node 8 \
+--selected_gpus 0,1,2,3,4,5,6,7 \
+--node_ips $(hostname -i) \
+--node_id 0 \
+./ernie/run_classifier.py \
 --use_cuda true \
 --do_train true \
 --do_val true \
 --do_test false \
 --verbose true \
---batch_size 8192 \
 --in_tokens true \
---init_pretraining_params ${MODEL_PATH}/params \
+--batch_size 8192 \
 --train_set ${TASK_DATA_PATH}/xnli/train.tsv \
 --dev_set ${TASK_DATA_PATH}/xnli/dev.tsv,${TASK_DATA_PATH}/xnli/test.tsv \
---vocab_path ${MODEL_PATH}/vocab.txt \
 --label_map ${TASK_DATA_PATH}/xnli/label_map.json \
+--vocab_path ${MODEL_PATH}/vocab.txt \
 --ernie_config_path ${MODEL_PATH}/ernie_config.json \
+--init_pretraining_params ${MODEL_PATH}/params \
 --checkpoints ./checkpoints \
 --save_steps 1000 \
 --weight_decay 0.01 \
 --warmup_proportion 0.0 \
---validation_steps 25 \
+--use_fp16 false \
+--validation_steps 100 \
 --epoch 3 \
 --max_seq_len 512 \
 --learning_rate 1e-4 \
@@ -30,3 +37,4 @@ python -u run_classifier.py \
 --num_iteration_per_drop_scope 1 \
 --num_labels 3 \
 --random_seed 1
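One behavioral detail in the hunk above: because `--in_tokens true` is set, `--batch_size 8192` is a token budget rather than an example count. Below is a minimal sketch of that packing rule; the repository's actual reader logic may differ in detail, so treat the function as an illustration only.

```python
def can_append(batch_lens, new_len, token_budget=8192):
    # A padded batch costs max_seq_len * num_examples tokens;
    # append the next sequence only if the batch stays within budget.
    max_len = max(batch_lens + [new_len])
    return max_len * (len(batch_lens) + 1) <= token_budget


# e.g. 64 sequences of length 128 fill an 8192-token budget exactly
assert can_append([128] * 63, 128)
assert not can_append([128] * 64, 128)
```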
@@ -3,7 +3,9 @@ set -eux
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0
-python -u run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_classifier.py \
 --use_cuda true \
 --verbose true \
 --do_train true \
...
@@ -3,7 +3,8 @@ set -eux
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0
-python -u ./run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_classifier.py \
 --use_cuda true \
 --verbose true \
 --do_train true \
...
@@ -4,7 +4,13 @@ export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-python -u run_mrc.py --use_cuda true\
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python ./ernie/finetune_launch.py \
+--nproc_per_node 8 \
+--selected_gpus 0,1,2,3,4,5,6,7 \
+--node_ips $(hostname -i) \
+--node_id 0 \
+./ernie/run_mrc.py --use_cuda true\
 --batch_size 8 \
 --in_tokens false\
 --use_fast_executor true \
...
@@ -3,7 +3,13 @@ set -eux
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-python -u run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python ./ernie/finetune_launch.py \
+--nproc_per_node 8 \
+--selected_gpus 0,1,2,3,4,5,6,7 \
+--node_ips $(hostname -i) \
+--node_id 0 \
+./ernie/run_classifier.py \
 --use_cuda true \
 --verbose true \
 --do_train true \
...
@@ -4,7 +4,13 @@ export FLAGS_eager_delete_tensor_gb=0.0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-python -u run_mrc.py --use_cuda true\
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python ./ernie/finetune_launch.py \
+--nproc_per_node 8 \
+--selected_gpus 0,1,2,3,4,5,6,7 \
+--node_ips $(hostname -i) \
+--node_id 0 \
+./ernie/run_mrc.py --use_cuda true\
 --batch_size 8 \
 --in_tokens false\
 --use_fast_executor true \
...
@@ -3,7 +3,8 @@ set -eux
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0
-python -u run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_classifier.py \
 --use_cuda true \
 --verbose true \
 --do_train true \
...
@@ -3,7 +3,8 @@ set -eux
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0
-python -u run_sequence_labeling.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_sequence_labeling.py \
 --use_cuda true \
 --do_train true \
 --do_val true \
@@ -14,15 +15,16 @@ python -u run_sequence_labeling.py \
 --chunk_scheme "IOB" \
 --label_map_config ${TASK_DATA_PATH}/msra_ner/label_map.json \
 --train_set ${TASK_DATA_PATH}/msra_ner/train.tsv \
---dev_set ${TASK_DATA_PATH}/msra_ner/dev.tsv \
+--dev_set ${TASK_DATA_PATH}/msra_ner/dev.tsv,${TASK_DATA_PATH}/msra_ner/test.tsv \
 --test_set ${TASK_DATA_PATH}/msra_ner/test.tsv \
---vocab_path config/vocab.txt \
---ernie_config_path config/ernie_config.json \
+--vocab_path ${MODEL_PATH}/vocab.txt \
+--ernie_config_path ${MODEL_PATH}/ernie_config.json \
 --checkpoints ./checkpoints \
 --save_steps 100000 \
 --weight_decay 0.01 \
 --warmup_proportion 0.0 \
 --validation_steps 100 \
+--use_fp16 false \
 --epoch 6 \
 --max_seq_len 256 \
 --learning_rate 1e-5 \
...
@@ -3,7 +3,8 @@ set -eux
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-python -u run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python -u ./ernie/run_classifier.py \
 --use_cuda true \
 --do_train true \
 --do_val true \
...
@@ -3,7 +3,13 @@ set -eux
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-python -u run_classifier.py \
+export PYTHONPATH=./ernie:${PYTHONPATH:-}
+python ./ernie/finetune_launch.py \
+--nproc_per_node 8 \
+--selected_gpus 0,1,2,3,4,5,6,7 \
+--node_ips $(hostname -i) \
+--node_id 0 \
+./ernie/run_classifier.py \
 --use_cuda true \
 --do_train true \
 --do_val true \
...
@@ -3,8 +3,12 @@ set -eux
 export FLAGS_eager_delete_tensor_gb=0
 export FLAGS_sync_nccl_allreduce=1
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-
-python -u ./train.py --use_cuda True \
+python ./ernie/pretrain_launch.py \
+--nproc_per_node 8 \
+--selected_gpus 0,1,2,3,4,5,6,7 \
+--node_ips $(hostname -i) \
+--node_id 0 \
+./ernie/train.py --use_cuda True \
 --is_distributed False\
 --use_fast_executor True \
 --weight_sharing True \
@@ -19,6 +23,7 @@ python -u ./train.py --use_cuda True \
 --save_steps 10000 \
 --ernie_config_path ./config/ernie_config.json \
 --learning_rate 1e-4 \
+--use_fp16 false \
 --weight_decay 0.01 \
 --max_seq_len 512 \
 --skip_steps 10
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import paddle
import paddle.fluid as fluid


def cast_fp16_to_fp32(i, o, prog):
    """Append a cast op to `prog` that copies fp16 tensor `i` into fp32 tensor `o`."""
    prog.global_block().append_op(
        type="cast",
        inputs={"X": i},
        outputs={"Out": o},
        attrs={
            "in_dtype": fluid.core.VarDesc.VarType.FP16,
            "out_dtype": fluid.core.VarDesc.VarType.FP32
        })


def cast_fp32_to_fp16(i, o, prog):
    """Append a cast op to `prog` that copies fp32 tensor `i` into fp16 tensor `o`."""
    prog.global_block().append_op(
        type="cast",
        inputs={"X": i},
        outputs={"Out": o},
        attrs={
            "in_dtype": fluid.core.VarDesc.VarType.FP32,
            "out_dtype": fluid.core.VarDesc.VarType.FP16
        })


def copy_to_master_param(p, block):
    """Create an fp32 "master" copy of parameter `p` inside `block`."""
    v = block.vars.get(p.name, None)
    if v is None:
        raise ValueError("no param name %s found!" % p.name)
    new_p = fluid.framework.Parameter(
        block=block,
        shape=v.shape,
        dtype=fluid.core.VarDesc.VarType.FP32,
        type=v.type,
        lod_level=v.lod_level,
        stop_gradient=p.stop_gradient,
        trainable=p.trainable,
        optimize_attr=p.optimize_attr,
        regularizer=p.regularizer,
        gradient_clip_attr=p.gradient_clip_attr,
        error_clip=p.error_clip,
        name=v.name + ".master")
    return new_p


def create_master_params_grads(params_grads, main_prog, startup_prog,
                               loss_scaling):
    """For each (param, grad) pair, build an fp32 master param and an fp32,
    unscaled gradient. Layer-norm params are not cast (they stay fp32); their
    gradients are only rescaled."""
    master_params_grads = []
    tmp_role = main_prog._current_role
    OpRole = fluid.core.op_proto_and_checker_maker.OpRole
    main_prog._current_role = OpRole.Backward
    for p, g in params_grads:
        # create master parameters
        master_param = copy_to_master_param(p, main_prog.global_block())
        startup_master_param = startup_prog.global_block()._clone_variable(
            master_param)
        startup_p = startup_prog.global_block().var(p.name)
        cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
        # cast fp16 gradients to fp32 before apply gradients
        if g.name.find("layer_norm") > -1:
            if loss_scaling > 1:
                scaled_g = g / float(loss_scaling)
            else:
                scaled_g = g
            master_params_grads.append([p, scaled_g])
            continue
        master_grad = fluid.layers.cast(g, "float32")
        if loss_scaling > 1:
            master_grad = master_grad / float(loss_scaling)
        master_params_grads.append([master_param, master_grad])
    main_prog._current_role = tmp_role
    return master_params_grads


def master_param_to_train_param(master_params_grads, params_grads, main_prog):
    """After the optimizer updates the fp32 masters, cast them back into the
    fp16 training params (layer-norm params are skipped: they never left fp32)."""
    for idx, m_p_g in enumerate(master_params_grads):
        train_p, _ = params_grads[idx]
        if train_p.name.find("layer_norm") > -1:
            continue
        with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
            cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)
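The `--use_fp16 false` flags added to the scripts above toggle the mixed-precision path that this new `fp16.py` supports. The file only builds cast ops and fp32 master copies; wiring it into an optimizer happens elsewhere in the repo. Below is a minimal usage sketch against the pre-2.0 `fluid` API; `loss`, `train_program`, `startup_program`, the Adam settings, and the scale factor are illustrative assumptions, not code from this commit.

```python
# Illustrative mixed-precision wiring for the utilities above.
loss_scaling = 128.0
optimizer = fluid.optimizer.Adam(learning_rate=1e-4)

# Scale the loss up so small fp16 gradients do not underflow.
scaled_loss = loss * loss_scaling
params_grads = optimizer.backward(scaled_loss)

# Build fp32 master params and unscaled fp32 grads.
master_params_grads = create_master_params_grads(
    params_grads, train_program, startup_program, loss_scaling)

# Update the fp32 masters, then cast them back into the fp16 params.
optimizer.apply_gradients(master_params_grads)
master_param_to_train_param(master_params_grads, params_grads, train_program)
```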