test=develop[

f6c68c85 · xixiaoyao · ee949ea1 · ee949ea1 · f6c68c85 · ee949ea1
80 changed files
--- a/LICENSE
+++ b/LICENSE
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-   1. Definitions.
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-   END OF TERMS AND CONDITIONS
-   APPENDIX: How to apply the Apache License to your work.
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-   Copyright [yyyy] [name of copyright owner]
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-       http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--- a/README.md
+++ b/README.md
-PALM
-===
-PALM (PAddLe for Multi-task) 是一个灵活易用的多任务学习框架，框架中内置了丰富的模型backbone（BERT、ERNIE等）、常见的任务范式（分类、匹配、序列标注、机器阅读理解等）和数据集读取与处理工具。对于典型的任务场景，用户几乎无需书写代码便可完成新任务的添加；对于特殊的任务场景，用户可通过对预置接口的实现来完成对新任务的支持。
-## 安装
+# 多任务学习框架PaddlePALM
-目前仅支持git clone源码的方式使用:
+# 安装
-```shell
+pip install paddlepalm
-git clone https://github.com/PaddlePaddle/PALM.git
-```
-**环境依赖**
- Python >= 2.7
- cuda >= 9.0
- cudnn >= 7.0
- PaddlePaddle >= 1.6 (请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装)
-## 目录结构
- backbone: 多任务学习的主干网络表示，支持bert, ernie等，用户可自定义添加
- config：存放各个任务实例的配置文件，用户添加任务时需在此建立该任务的配置文件
- data: 存放各个任务的数据集
- pretrain_model: 存放预训练模型、字典及其相关配置
- optimizer: 优化器，用户可在此自定义优化器
- reader: 各个任务的数据读取与处理模块以及做reader融合的joint_reader文件
- paradigm: 任务输出层相关网络结构描述
- utils: 通用工具函数文件
- mtl_run.py: 多任务学习的主要流程描述
- run.sh: 多任务学习启动脚本
-## 使用说明
-框架给出了三个添加完成的任务示例：*Machine Reading Comprehension*、*Mask Language Model*和*Question Answer Matching*。其中在`mtl_config.yaml`中将*Machine Reading Comprehension*设置为了主任务，其他为辅助任务，用户可通过如下命令启动多任务学习
+# 使用
-```shell
+### 1. 创建任务实例
-bash run.sh
-```
-*提示：首次运行时，脚本会自动下载预训练的bert和ernie模型，请耐心等待*
-### 多任务学习配置
+使用yaml格式描述任务实例，每个任务实例中的必选字段包括
-在`mtl_config.yaml`中完成对多任务训练和推理的主配置，配置包含如下
+- train_file: 训练集文件路径
+- reader: 数据集载入与处理工具名，框架预置reader列表见[这里](https://www.baidu.com/)
+- backbone: 骨架模型名，框架预置backbone列表见[这里](https://www.baidu.com/)
+- paradigm: 任务范式(类型)名，框架预置paradigm列表见[这里](https://www.baidu.com/)
-***必选字段***
+### 2. 完成训练配置
- main_task：*(str)* 指定主任务的名称，目前仅支持单个主任务。名称选取自`config`文件夹中的配置的文件名（不包含后缀`.yaml`和为task共享而设置的中间后缀）
+使用yaml格式完成配置多任务学习中的相关参数，如指定任务实例及其相关的主辅关系、参数复用关系、采样权重等
- auxiliary_task：*(str)* 指定辅助任务，支持多个辅助任务，辅助任务之间使用空格隔开。名称选取自`config`文件夹中的配置的文件名（不包含后缀`.yaml`和为task共享而设置的中间后缀）
- do_train：*(bool)* 训练标志位
- do_predict：*(bool)* 预测标志位，目前仅支持对主任务进行预测
- checkpoint_path: *(str)* 模型保存、训练断点恢复和预测模型载入路径，从该路径载入模型时默认读取最后一个训练step的模型
- backbone_model：*(str)* 使用的骨干网络，名称选取自`backbone`目录下的模块。注意，更换backbone时，若使用预训练模型，应同步更换预训练模型参数、配置和字典等相关字段
- vocab_path：*(str)* 字典文件，纯文本格式存储，其中每行为一个单词
- optimizer：*(str)* 优化器名称，名称选取自`optimizer`中的文件名
- learning_rate：*(str)* 训练阶段的学习率
- skip_steps：*(int)* 训练阶段打印日志的频率（step为单位）
- epoch：*(int)* 主任务的训练epoch数
- use_cuda：*(bool)* 使用GPU训练的标志位
- warmup_proportion：*(float)* 预训练模型finetuning时的warmup比例
- use_ema：*(bool)* 是否开启ema进行训练和推理
- ema_decay：*(float)* 开启ema时的衰减指数
- random_seed：*(int)* 随机种子
- use_fp16：*(bool)* 开启混合精度训练标志位
- loss_scaling：*(float)* 开启混合精度训练时的loss缩放因子
-***可选字段***
+### 3. 开始训练
- pretrain_model_path：*(str)* 预训练模型的载入路径，该路径下应包含存储模型参数的params文件夹
+```python
- pretrain_config_path：*(str)* 预训练模型的配置文件，json格式描述
- do_lower_case：*(bool)* 预处理阶段是否区分大小写
- 其他用户自定义字段
+import paddlepalm as palm
-### 使用示例
+if __name__ == '__main__':
-若内置任务可满足用户需求，或用户已经完成自定义任务的添加，可通过如下方式直接启动多任务学习。
+    controller = palm.Controller('config.yaml', task_dir='task_instance')
+    controller.load_pretrain('pretrain_model/ernie/params')
+    controller.train()
+```
-例如，框架中内置了一个小数据集，包含MRQA阅读理解评测数据`mrqa`、MaskLM训练数据`mlm4mrqa`和问题与答案所在上下文的匹配数据集`am4mrqa`，而在框架中已经内置了机器阅读理解任务(`reading_comprehension`)、掩码语言模型任务（`mask_language_model`)和问答匹配任务（`answer_matching`）。这里我们希望用掩码语言模型和问答匹配任务来提升机器阅读理解任务的效果，那么我们可通过如下流程完成多任务学习的启动。
+### 4. 预测
-首先在config文件夹中添加训练任务相关的配置文件：
+用户可在训练结束后直接调用pred接口对某个目标任务进行预测
-1. `reading_comprehension.yaml`
+示例：
 ```python
-train_file: "data/mrqa/mrqa-combined.train.raw.json"
+import paddlepalm as palm
-predict_file: "data/mrqa/mrqa-combined.dev.raw.json"
-batch_size: 4
-mix_ratio: 1.0
-in_tokens: false
-doc_stride: 128
-sample_rate: 0.02
-...
-```
-2. `mask_language_model.yaml`
+if __name__ == '__main__':
-```python
+    controller = palm.Controller(config_path='config.yaml', task_dir='task_instance')
-train_file: "data/mlm4mrqa"
+    controller.load_pretrain('pretrain_model/ernie/params')
-mix_ratio: 0.4
+    controller.train()
-batch_size: 4
+    controller.pred('mrqa')
-in_tokens: False
-generate_neg_sample: False
 ```
-3. `answer_matching.yaml`
+也可新建controller直接预测
-```python
-train_file: "data/am4mrqa/train.txt" 
-mix_ratio: 0.4
-batch_size: 4
-in_tokens: False
-```
-而后可以在主配置文件`mtl_config.yaml`中完成多任务学习的配置，其中，使用`main_task`字段指定主任务，使用`auxilary_task`可指定辅助任务，多个辅助任务之间使用空格"` `"隔开。
-epoch的设定仅针对设定为主任务有效，`mix ratio`的基准值1.0也是针对主任务的训练步数而言的。例如，对于`epoch=2`，若将`reading_comprehension`任务的`mix ratio`设定为1.0，`mask_language_model`的`mix ratio`设定为0.5，那么`reading_comprehension`任务将训练两个完整的`epoch`，而`mask_language_model`任务的训练步数等于`reading_comprehension`训练步数的一半。
 ```python
-main_task: "reading_comprehension"
+import paddlepalm as palm
-auxiliary_task: "mask_language_model answer_matching"
-do_train: True
+if __name__ == '__main__':
-epoch: 2
+    controller = palm.Controller(config_path='config.yaml', task_dir='task_instance')
-...
+    controller.pred('mrqa', infermodel_path='output_model/firstrun2/infer_model')
 ```
-最后，运行`run.sh`启动三个任务的联合训练。若用户希望删掉其中某些辅助任务，可通过修改`mtl_config.yaml`中的`auxilary_task`字段来实现。
-### 添加新任务
-用户添加任务时，在准备好该任务的数据集后，需要完成如下3处开发工作：
-***config模块***
-位于`./config`目录。存放各个任务实例的配置文件，使用`yaml`格式描述。配置文件中的必选字段包括
- batch_size：每个训练或推理step所使用样本数。当`in_tokens`为True时，`batch_size`表示每个step所包含的tokens数量。
- in_tokens：是否使用lod tensor的方式构造batch，当`in_tokens`为False时，使用padding方式构造batch。
-训练阶段包含的必选字段包括
- train_file：训练集文件路径
- mix_ratio：该任务的训练阶段采样权重（1.0代表与主任务采样次数的期望相同）
-推理阶段包含的必选字段包括
- predict_file：测试集文件路径
-此外用户可根据任务需要，自行定义其他超参数，该超参可在创建任务模型时被访问
-***reader模块***
-位于`./reader`目录下。完成数据集读取与处理。新增的reader应放置在`paradigm`目录下，且包含一个`get_input_shape`方法和`DataProcessor`类。
- **get_input_shape**: *(function)*  定义reader给backbone和task_paradigm生成的数据的shape和dtype，且需要同时返回训练和推理阶段的定义。
-  - 输入参数
-    - args: *(dict)* 解析后的任务配置
-  - 返回值
-    - train_input_shape: *(dict)* 包含backbone和task两个key，每个key对应的value为一个list，存储若干`(shape, dtype)`的元组
-    - test_input_shape: *(dict)* 包含backbone和task两个key，每个key对应的value为一个list，存储若干`(shape, dtype)`的元组
- **DataProcessor**：*(class)*   定义数据集的载入、预处理和遍历
-  - \_\_init\_\_: 构造函数，解析和存储相关参数，进行必要的初始化
-    - 输入参数
-      - args: *(dict)* 解析后的任务配置
-    - 返回值
-      - 无
-  - data_generator: *(function)* 数据集的迭代器，被遍历时每次yield一个batch
-    - 输入参数
-      - phase: *(str)* 任务所处阶段，支持训练`train`和推理`predict`两种可选阶段
-      - shuffle: *(bool)* 训练阶段是否进行数据集打乱
-      - dev_count: *(int)* 可用的GPU数量或CPU数量
-    - yield输出
-      - tensors: (list) 根据`get_input_shape`中定义的任务backbone和task的所需输入shape和类型，来yield相应list结构的数据。其中被yield出的list的头部元素为backbone要求的输入数据，后续元素为task要求的输入数据
-  - get_num_examples: *(function)* 返回样本数。注意由于滑动窗口等机制，实际运行时产生的样本数可能多于数据集中的样本数，这时应返回runtime阶段实际样本数
-    - 输入参数
-      - 无
-    - 返回值
-      - num_examples: *(int)* 样本数量
-***task_paradigm模块***
-位于`./paradigm`目录下。描述任务范式（如分类、匹配、阅读理解等）。新增的任务范式应放置在`paradigm`目录下，且应包含`compute_loss`和`create_model`两个必选方法，以及`postprocess`，`global_postprocess`两个可选方法。
- create_model：*(function)* 创建task模型
-  - 输入参数
-    - reader_input：*(nested Variables)* 数据输入层的输出，定义位于该任务的reader模块的`input_shape`方法中。输入的前N个元素为backbone的输入元素，之后的元素为task的输入。
-    - base_model：*(Model)* 模型backbone的实例，可调用backbone的对外输出接口来实现task与backbone的连接。一般来说，backbone的输出接口最少包括`final_sentence_representation`和`final_word_representation`两个属性。
-      - base_model.final_sentence_representation：*(Variable)* 输入文本的向量表示，shape为`[batch_size, hidden_size]`。
-      - base_model.final_word_representation：*(Variable)* 输入文本中每个单词的向量表示，shape为`[batch_size, max_seqlen, hidden_size]`
-    - is_training：*(bool)* 训练标志位
-    - args：*(Argument)* 任务相关的参数配置，具体参数在config文件夹中定义
-  - 返回值
-    - output_tensors: *(dict)* 任务输出的tensor字典。训练阶段的输出字典中应至少包括num_seqs元素，num_seqs记录了batch中所包含的样本数（在输入为lod tensor（args.in_tokens被设置为True）时所以样本压平打平，没有样本数维度）
- compute_loss: *(function)* 计算task在训练阶段的batch平均损失值
-  - 输入参数
-    - output_tensors: *(dict)* 创建task时（调用`create_model`）时返回值，存储计算loss所需的Variables的名字到实例的映射
-    - args：*(Argument)* 任务相关的参数配置，具体参数在config文件夹中定义
-  - 返回值
-    - total_loss：*(Variable)* 当前batch的平均损失值
- postprocess：*(function)* 推理阶段对每个推理step得到的fetch_results进行的后处理，返回对该step的每个样本的后处理结果
-  - 输入参数
-    - fetch_results：(dict) 当前推理step的fetch_dict中的计算结果，其中fetch_dict在create_model时定义并返回。
-  - 返回值
-    - processed_results：(list)当前推理step所有样本的后处理结果。
- global_postprocess: *(function)* 推理结束后，对全部样本的后处理结果进行最终处理（如结果保存、二次后处理等）
-  - 输入参数
-    - pred_buf：所有测试集样本的预测后处理结果
-    - processor：任务的数据集载入与处理类DataProcessor的实例
-    - mtl_args：多任务学习配置，在`mtl_conf.yaml`中定义
-    - task_args：任务相关的参数配置，在`conf`文件夹中定义
-  - 返回值
-    - 无
-***命名规范***
-为新任务创建config，task_paradigm和reader文件后，应将三者文件名统一，且为reader文件的文件名增加`_reader`后缀。例如，用户添加的新任务名为yelp_senti，则config文件名为`yelp_senti.yaml`，放置于config文件夹下；task_paradigm文件名为`yelp_senti.py`，放置于paradigm文件夹下；reader文件名为`yelp_senti_reader.py`，放置于reader文件夹下。
-***One-to-One模式（任务层共享）***
-框架默认使用one-to-many的模式进行多任务训练，即多任务共享encoder，不共享输出层。该版本同时支持one-to-one模式，即多任务同时共享encoder和输出层（模型参数完全共享，但是有不同的数据源）。该模式通过config文件命名的方式开启，具体流程如下。
-```
+# 运行机制
-1. mtl_config.yaml下用户配置任务相关的名称，如main_task: "reading_comprehension"
-2. 如果一个任务的数据集是多个来源，请在configs下对同一个任务添加多个任务配置，如任务为"reading_comprehension"有两个数据集需要训练，且每个batch内的数据都来自同一数据集，则需要添加reading_comprehension.name1.yaml和reading_comprehension.name2.yaml两个配置文件，其中name1和name2用户可根据自己需求定义名称，框架内不限定名称定义；
+### 多任务学习机制
-3. 启动多任务学习：sh run.sh
+pass 
-```
-## 框架结构与运行原理
+### 训练终止机制
-框架结构如图所示
-![框架图](https://tva1.sinaimg.cn/large/006y8mN6ly1g7goo0bjzwj31c20om13h.jpg)
+- 默认的设置：
+  - **所有target任务达到目标训练步数后多任务学习停止**
+  - 未设置成target任务的任务（即辅助任务）不会影响训练终止与否，只是担任“陪训”的角色
+  - 注：默认所有的任务都是target任务，用户可以通过`target_tag`来标记目标/辅助任务
+  - 每个目标任务的目标训练步数由num_epochs和mix_ratio计算得到
-其中`mtl_config.yaml`用于配置多任务主控的参数设定，每个任务实例的配置由用户完成后放置于`config`文件夹中。当用户运行`run.sh`后，脚本启动多任务学习控制器，控制器开始解析`mtl_config.yaml`和各个任务实例的配置文件，进而创建backbone、为各个任务创建reader和任务层，最后控制器启动训练任务，实现多任务训练。
+### 保存机制
-## License
+- 默认的设置：
-This tutorial is contributed by [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) and licensed under the [Apache-2.0 license](https://github.com/PaddlePaddle/models/blob/develop/LICENSE).
+  - 训练过程中，保存下来的模型分为checkpoint (ckpt)和inference model (infermodel)两种：
+    - ckpt保存的是包含所有任务的总计算图（即整个多任务学习计算图），用于训练中断恢复
+    - infermodel保存的是某个目标任务的推理计算图和推理依赖的相关配置
+  - 对于每个target任务，训练到预期的步数后自动保存inference model，之后不再保存。（注：保存inference model不影响ckpt的保存）
+- 用户可修改的配置：
+  - 使用`save_ckpt_every_steps`来控制保存ckpt的频率，默认不保存
+  - 每个task instance均可使用`save_infermodel_every_steps`来控制该task保存infermodel的频率，默认为-1，即只在达到目标训练步数时保存一次
-## 许可证书
-此向导由[PaddlePaddle](https://github.com/PaddlePaddle/Paddle)贡献，受[Apache-2.0 license](https://github.com/PaddlePaddle/models/blob/develop/LICENSE)许可认证。
--- a/backbone/README.md
+++ b/backbone/README.md
-该目录用来存放模型backbone，用户可通过实现backbone的接口完成自定义。
--- a/backbone/utils/transformer.py
+++ b/backbone/utils/transformer.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Transformer encoder."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from functools import partial
-from functools import reduce
-import numpy as np
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-from paddle.fluid.layer_helper import LayerHelper
-def layer_norm(x, begin_norm_axis=1, epsilon=1e-6, param_attr=None, bias_attr=None):
-    helper = LayerHelper('layer_norm', **locals())
-    mean = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
-    shift_x = layers.elementwise_sub(x=x, y=mean, axis=0)
-    variance = layers.reduce_mean(layers.square(shift_x), dim=begin_norm_axis, keep_dim=True)
-    r_stdev = layers.rsqrt(variance + epsilon)
-    norm_x = layers.elementwise_mul(x=shift_x, y=r_stdev, axis=0)
-    param_shape = [reduce(lambda x, y: x * y, norm_x.shape[begin_norm_axis:])]
-    param_dtype = norm_x.dtype
-    scale = helper.create_parameter(
-        attr=param_attr,
-        shape=param_shape,
-        dtype=param_dtype,
-        default_initializer=fluid.initializer.Constant(1.))
-    bias = helper.create_parameter(
-        attr=bias_attr,
-        shape=param_shape,
-        dtype=param_dtype,
-        is_bias=True,
-        default_initializer=fluid.initializer.Constant(0.))
-    out = layers.elementwise_mul(x=norm_x, y=scale, axis=-1)
-    out = layers.elementwise_add(x=out, y=bias, axis=-1)
-    return out
-def multi_head_attention(queries,
-                         keys,
-                         values,
-                         attn_bias,
-                         d_key,
-                         d_value,
-                         d_model,
-                         n_head=1,
-                         dropout_rate=0.,
-                         cache=None,
-                         param_initializer=None,
-                         name='multi_head_att'):
-    """
-    Multi-Head Attention. Note that attn_bias is added to the logit before
-    computing softmax activiation to mask certain selected positions so that
-    they will not considered in attention weights.
-    """
-    keys = queries if keys is None else keys
-    values = keys if values is None else values
-    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
-        raise ValueError(
-            "Inputs: quries, keys and values should all be 3-D tensors.")
-    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
-        """
-        Add linear projection to queries, keys, and values.
-        """
-        q = layers.fc(input = queries,
-                      size = d_key * n_head,
-                      num_flatten_dims = 2,
-                      param_attr = fluid.ParamAttr(
-                          name = name + '_query_fc.w_0',
-                          initializer = param_initializer),
-                      bias_attr = name + '_query_fc.b_0')
-        k = layers.fc(input = keys,
-                      size = d_key * n_head,
-                      num_flatten_dims = 2,
-                      param_attr = fluid.ParamAttr(
-                          name = name + '_key_fc.w_0',
-                          initializer = param_initializer),
-                      bias_attr = name + '_key_fc.b_0')
-        v = layers.fc(input = values,
-                      size = d_value * n_head,
-                      num_flatten_dims = 2,
-                      param_attr = fluid.ParamAttr(
-                          name = name + '_value_fc.w_0',
-                          initializer = param_initializer),
-                      bias_attr = name + '_value_fc.b_0')
-        return q, k, v
-    def __split_heads(x, n_head):
-        """
-        Reshape the last dimension of inpunt tensor x so that it becomes two
-        dimensions and then transpose. Specifically, input a tensor with shape
-        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
-        with shape [bs, n_head, max_sequence_length, hidden_dim].
-        """
-        hidden_size = x.shape[-1]
-        # The value 0 in shape attr means copying the corresponding dimension
-        # size of the input as the output dimension size.
-        reshaped = layers.reshape(
-            x = x, shape = [0, 0, n_head, hidden_size // n_head], inplace=False)
-        # permuate the dimensions into:
-        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
-        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
-    def __combine_heads(x):
-        """
-        Transpose and then reshape the last two dimensions of inpunt tensor x
-        so that it becomes one dimension, which is reverse to __split_heads.
-        """
-        if len(x.shape) == 3: return x
-        if len(x.shape) != 4:
-            raise ValueError("Input(x) should be a 4-D Tensor.")
-        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
-        # The value 0 in shape attr means copying the corresponding dimension
-        # size of the input as the output dimension size.
-        return layers.reshape(
-            x = trans_x,
-            shape = [0, 0, trans_x.shape[2] * trans_x.shape[3]],
-            inplace = False)
-    def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate):
-        """
-        Scaled Dot-Product Attention
-        """
-        scaled_q = layers.scale(x = q, scale = d_key**-0.5)
-        product = layers.matmul(x = scaled_q, y = k, transpose_y = True)
-        if attn_bias:
-            product += attn_bias
-        weights = layers.softmax(product)
-        if dropout_rate:
-            weights = layers.dropout(
-                weights,
-                dropout_prob=dropout_rate,
-                dropout_implementation="upscale_in_train",
-                is_test=False)
-        out = layers.matmul(weights, v)
-        return out
-    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
-    if cache is not None:  # use cache and concat time steps
-        # Since the inplace reshape in __split_heads changes the shape of k and
-        # v, which is the cache input for next time step, reshape the cache
-        # input from the previous time step first.
-        k = cache["k"] = layers.concat(
-            [layers.reshape(
-                cache["k"], shape=[0, 0, d_model]), k], axis=1)
-        v = cache["v"] = layers.concat(
-            [layers.reshape(
-                cache["v"], shape=[0, 0, d_model]), v], axis=1)
-    q = __split_heads(q, n_head)
-    k = __split_heads(k, n_head)
-    v = __split_heads(v, n_head)
-    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key,
-                                                  dropout_rate)
-    out = __combine_heads(ctx_multiheads)
-    # Project back to the model size.
-    proj_out = layers.fc(input = out,
-                         size = d_model,
-                         num_flatten_dims = 2,
-                         param_attr=fluid.ParamAttr(
-                             name = name + '_output_fc.w_0',
-                             initializer = param_initializer),
-                         bias_attr = name + '_output_fc.b_0')
-    return proj_out
-def positionwise_feed_forward(x,
-                              d_inner_hid,
-                              d_hid,
-                              dropout_rate,
-                              hidden_act,
-                              param_initializer=None,
-                              name='ffn'):
-    """
-    Position-wise Feed-Forward Networks.
-    This module consists of two linear transformations with a ReLU activation
-    in between, which is applied to each position separately and identically.
-    """
-    hidden = layers.fc(input=x,
-                       size=d_inner_hid,
-                       num_flatten_dims=2,
-                       act=hidden_act,
-                       param_attr=fluid.ParamAttr(
-                           name=name + '_fc_0.w_0',
-                           initializer=param_initializer),
-                       bias_attr=name + '_fc_0.b_0')
-    if dropout_rate:
-        hidden = layers.dropout(
-            hidden,
-            dropout_prob=dropout_rate,
-            dropout_implementation="upscale_in_train",
-            is_test = False)
-    out = layers.fc(input = hidden,
-                    size = d_hid,
-                    num_flatten_dims = 2,
-                    param_attr=fluid.ParamAttr(
-                        name = name + '_fc_1.w_0', 
-                        initializer = param_initializer),
-                    bias_attr = name + '_fc_1.b_0')
-    return out
-def pre_post_process_layer(prev_out, out, process_cmd, dropout_rate=0.,
-                           name=''):
-    """
-    Add residual connection, layer normalization and droput to the out tensor
-    optionally according to the value of process_cmd.
-    This will be used before or after multi-head attention and position-wise
-    feed-forward networks.
-    """
-    for cmd in process_cmd:
-        if cmd == "a":  # add residual connection
-            out = out + prev_out if prev_out else out
-        elif cmd == "n":  # add layer normalization
-            out_dtype = out.dtype
-            if out_dtype == fluid.core.VarDesc.VarType.FP16:
-                out = layers.cast(x = out, dtype = "float32")
-            out = layer_norm(
-                out,
-                begin_norm_axis=len(out.shape) - 1,
-                param_attr=fluid.ParamAttr(
-                    name = name + '_layer_norm_scale',
-                    initializer = fluid.initializer.Constant(1.)),
-                bias_attr=fluid.ParamAttr(
-                    name = name + '_layer_norm_bias',
-                    initializer = fluid.initializer.Constant(0.)))
-            if out_dtype == fluid.core.VarDesc.VarType.FP16:
-                out = layers.cast(x = out, dtype = "float16")
-        elif cmd == "d":  # add dropout
-            if dropout_rate:
-                out = layers.dropout(
-                    out,
-                    dropout_prob = dropout_rate,
-                    dropout_implementation = "upscale_in_train",
-                    is_test = False)
-    return out
-pre_process_layer = partial(pre_post_process_layer, None)
-post_process_layer = pre_post_process_layer
-def encoder_layer(enc_input,
-                  attn_bias,
-                  n_head,
-                  d_key,
-                  d_value,
-                  d_model,
-                  d_inner_hid,
-                  prepostprocess_dropout,
-                  attention_dropout,
-                  relu_dropout,
-                  hidden_act,
-                  preprocess_cmd="n",
-                  postprocess_cmd="da",
-                  param_initializer=None,
-                  name=''):
-    """The encoder layers that can be stacked to form a deep encoder.
-    This module consits of a multi-head (self) attention followed by
-    position-wise feed-forward networks and both the two components companied
-    with the post_process_layer to add residual connection, layer normalization
-    and droput.
-    """
-    attn_output = multi_head_attention(
-        pre_process_layer(
-            enc_input,
-            preprocess_cmd,
-            prepostprocess_dropout,
-            name=name + '_pre_att'),
-        None,
-        None,
-        attn_bias,
-        d_key,
-        d_value,
-        d_model,
-        n_head,
-        attention_dropout,
-        param_initializer = param_initializer,
-        name = name + '_multi_head_att')
-    attn_output = post_process_layer(
-        enc_input,
-        attn_output,
-        postprocess_cmd,
-        prepostprocess_dropout,
-        name = name + '_post_att')
-    ffd_output = positionwise_feed_forward(
-        pre_process_layer(
-            attn_output,
-            preprocess_cmd,
-            prepostprocess_dropout,
-            name = name + '_pre_ffn'),
-        d_inner_hid,
-        d_model,
-        relu_dropout,
-        hidden_act,
-        param_initializer = param_initializer,
-        name = name + '_ffn')
-    return post_process_layer(
-        attn_output,
-        ffd_output,
-        postprocess_cmd,
-        prepostprocess_dropout,
-        name = name + '_post_ffn')
-def encoder(enc_input,
-            attn_bias,
-            n_layer,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            prepostprocess_dropout,
-            attention_dropout,
-            relu_dropout,
-            hidden_act,
-            preprocess_cmd="n",
-            postprocess_cmd="da",
-            param_initializer=None,
-            name='',
-            return_all = False):
-    """
-    The encoder is composed of a stack of identical layers returned by calling
-    encoder_layer.
-    """
-    enc_outputs = []
-    for i in range(n_layer):
-        enc_output = encoder_layer(
-            enc_input,
-            attn_bias,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            prepostprocess_dropout,
-            attention_dropout,
-            relu_dropout,
-            hidden_act,
-            preprocess_cmd,
-            postprocess_cmd,
-            param_initializer = param_initializer,
-            name = name + '_layer_' + str(i))
-        enc_input = enc_output
-        if i < n_layer - 1:
-            enc_outputs.append(enc_output)
-    enc_output = pre_process_layer(
-        enc_output, preprocess_cmd, prepostprocess_dropout, name="post_encoder")
-    enc_outputs.append(enc_output)
-    if not return_all:
-        return enc_output
-    else:
-        return enc_output, enc_outputs
--- a/config.yaml
+++ b/config.yaml
+task_instance: "mrqa, match4mrqa"
+target_tag: 1, 0
+mix_ratio: 1.0, 0.5
+save_path: "output_model/firstrun"
+backbone: "ernie"
+backbone_config_path: "pretrain_model/ernie/ernie_config.json"
+vocab_path: "pretrain_model/ernie/vocab.txt"
+do_lower_case: True
+max_seq_len: 512
+batch_size: 5
+num_epochs: 2
+optimizer: "adam"
+learning_rate: 3e-5
+warmup_proportion: 0.1
+weight_decay: 0.1
--- a/config/answer_matching.yaml
+++ b/config/answer_matching.yaml
-train_file: "data/am4mrqa/train.txt"
-mix_ratio: 0.4
-batch_size: 4
-in_tokens: False
-generate_neg_sample: False
--- a/config/reading_comprehension.yaml
+++ b/config/reading_comprehension.yaml
-train_file: "data/mrqa/mrqa-combined.train.raw.json"
-predict_file: "data/mrqa/mrqa-combined.dev.raw.json"
-sample_rate: 0.02
-mix_ratio: 1.0
-batch_size: 4
-in_tokens: false
-doc_stride: 128
-with_negative: false
-max_query_length: 64
-max_answer_length: 30
-n_best_size: 20
-null_score_diff_threshold: 0.0
-verbose: False
--- a/data/am4mrqa/dev.txt
+++ b/data/am4mrqa/dev.txt
--- a/data/am4mrqa/train.txt
+++ b/data/am4mrqa/train.txt
--- a/data/match4mrqa/train.txt
+++ b/data/match4mrqa/train.txt
+label	text_a	text_b
+1	From what work of Durkheim's was interaction ritual theory derived?	Subsequent to these developments, Randall Collins (2004) formulated his interaction ritual theory by drawing on Durkheim's work on totemic rituals that was extended by Goffman (1964/2013; 1967) into everyday focused encounters. Based on interaction ritual theory, we experience different levels
+0	where is port au prince located in haiti	are near the water , while residential neighborhoods are located on the hills above . Its population is difficult to ascertain due to the rapid growth of slums in the hillsides above the
+0	What is the world’s first-ever pilsner type blond lager, the company also awarded the Master Homebrewer Competition held in San Francisco to an award-winning brewer who won the prestigious American Homebrewers Associations' Homebrewer of the Year award in 2013?	of the Year award in 2013, becoming the first woman in thirty years, and the first African American person ever to ever win the award. After an extensive career with the California State Legislature she began working for PicoBrew, a product development company in Seattle, WA that specializes in automated brewing equipment. In addition to
+0	the gakkel ridge is a boundary between which two tectonic plates	Mid-Atlantic Ridge ( MAR ) is a mid-ocean ridge , a divergent tectonic plate or constructive plate boundary located along the floor of the Atlantic Ocean , and part of the longest mountain range in the world . The ridge extends from a junction with the Gakkel Ridge ( Mid-Arctic Ridge ) northeast of Greenland southward to the Bouvet Triple Junction in the South Atlantic . Although the Mid-Atlantic Ridge is mostly an underwater
+1	what is the song new years day about	a song by Irish rock band U2 . It is on their 1983 album War and it was released as the album 's lead single in January 1983 . Written about the Polish Solidarity movement , `` New Year 's Day '' is driven by Adam Clayton 's distinctive bassline and the Edge 's piano and guitar playing . It was the band 's first UK hit
+0	Which is the biggest historical sites?	the Declaration of Independence was signed, and the Liberty Bell are the city's most famous attractions. Other historic sites include homes for Edgar Allan Poe, Betsy Ross, and Thaddeus Kosciuszko, early government buildings like the First and Second Banks of
+1	Encyclopedia of Afghan Jihad, is a manual of Jihad in eleven volumes, detailing how to make and use explosives and firearms, how to plan and carry out assassinations and other terrorist acts, and much more, it was found in the London residence of which Islamic cleric, who was the imam of Finsbury Park Mosque in London, England, where he preached Islamic fundamentalism and militant Islamism?	and much more. It was found in the London residence of Islamic cleric Sheikh Abu Hamza al-Masri in May 2004. During al-Masri's trial, the prosecution referred to the literature as a "blueprint for terror". [PAR] [TLE] Abu Hamza al-Masri [SEP] Mustafa Kamel Mustafa (Arabic: مصطفى كامل مصطفى‎ ‎ ; born 15 April 1958), also known as Abu Hamza al-Masri ( أبو حمزة المصري , "Abū Ḥamzah al-Maṣrī" – literally, the Egyptian father of Hamza), the Hook
+0	How can static analysis be useful with query languages?	the *Abstract interpretation framework has been extended to the field of query languages for relational databases as a way to support sound approximation techniques. The
+1	In what year did the Motion Picture Association of America introduce its rating system?	format being those only ones presented in color), depending upon the original content of movies, particularly movies released after the 1968 implementation of the Motion Picture Association of America's ratings system and the concurrent disestablishment of the Motion Picture Production Code.
+0	What was the windspeed once the typhoon weakened?	in north Luzon, due to high waters there related to the storm, the news agency said. Presidential spokesman Abigail Valte earlier Saturday urged residents of low-lying and mountainous areas that could be hit hard by the storm to evacuate, the state news agency said, citing an interview conducted on a government radio station. World Vision, the Christian humanitarian organization, said Saturday that it had to postpone some of its relief efforts due to Nalgae, with two of three emergency teams set to deploy once the storm passes. Another team is in Bulcan province, most of which is "still submerged" because of Nesat. The group is focusing its post-Nesat efforts on two communities in Manila and three in the northern Isabela and Zambales provinces. Vouchers are being distributed so people can buy needed items, some emergency supplies are being given out directly to citizens and 3,000 Manila children will receive school supplies. Sherbien Dacalanio, a CNN iReporter in the Philippines, described one area of Manila as being devastated by Nesat. "The damage
+1	What is the initial phantom body pain sensation experienced by people with spinal cord damage?	paraplegics, phantom body pain in areas of complete sensory loss. This phantom body pain is initially described as burning or tingling but may evolve into severe crushing or pinching pain, or the sensation of fire running down the legs or
+0	Sarah Etonge is a star in the sport that involves running over what landforms?	Sarah Etonge [SEP] Sarah Liengu Etonge (born in Buea, Southwest Province, Cameroon) is a current African mountain running star. She has won the annual Mount Cameroon Race of Hope seven times in her native Buea, in the
+0	Den of Thieves stars which Scottish actor?	of Thieves (film) [SEP] Den of Thieves is an upcoming American heist action thriller film directed by Christian Gudegast. The film stars Gerard Butler, 50 Cent, Pablo Schreiber, O'Shea Jackson Jr., Evan Jones, Dawn Olivieri, Mo McRae, and Max Holloway. After studying law, Butler turned to
+0	How long does it take for listeria to make a person sick?	Food and Drug Administration said it has teamed up with state officials in the effort. "FDA and its state partners are conducting checks at retail stores, wholesalers and distributors to make sure they have
+1	Steve Haworth did body modification on Dennis Avner, who underwent how many surgeries? 	1958 – November 5, 2012) was an American man known for his extensive body modifications, which were intended to increase his resemblance to a tigress. For his 14 surgical procedures towards that goal, he held a world record for "most permanent transformations to look like an animal." The name "Stalking Cat" is a Native American name,
+0	The Song, Your Mirror, is featured on Stars an, album by Simply Red that was released in what year?	on the US "Billboard" Hot 100. Written by Mick Hucknall, it was featured on the album "Stars" and reached number 17 on the UK chart when released in July
+1	What award did the animator of the semi-sequel spin-off to Kung Fu Panda receive?	work at DreamWorks Animation and for his award-winning student film "Le Building". He joined DreamWorks in 2008, after working on 2D animated films in France. The following year, Perifel won an Annie Award for the short film "Secrets of the Furious Five". He received subsequent nominations for "Kung Fu Panda 2" and "Rise of the Guardians".
+0	Who did Madonna turn to for comfort during her mother's illness?	siblings resented housekeepers and invariably rebelled against anyone brought into their home ostensibly to take the place of their beloved mother. Madonna later told Vanity
+0	What do people in Haiti use to contact relatives, friends in earthquake zones?	rumbled across Haiti on Tuesday evening. "It's 8:44 p.m. and we're still getting aftershocks! Can hear people gathered in the distance singing prayers," wrote Richard Morse, hotel manager at the Oloffson Hotel in the capital, Port-au-Prince. On Twitter, he captured the aftermath of a 7.0-magnitude earthquake that struck off the coast of Haiti, causing widespread destruction. The quake, which happened shortly before 5 p.m., was followed by at least 18 aftershocks, averaging a magnitude of 5.0, according to the U.S. Geological Survey. As the night went on, the singing and praying intensified and then waned, Morse wrote. What he didn't hear in all the commotion were helicopters or ambulances. Frantic callers broadcast pleas for help. "My cousin has a broken leg and she's trying to get to the hospital," one caller said. "She has two kids. ... How can she get to the hospital?" Are you there? Submit an iReport The man gave his cousin's address, and the channel's commentator asked anyone listening who lived nearby to see whether he could help. Many callers expressed their sympathies and prayers for the people of Haiti, while others told of fallen buildings and damaged homes. The first photographs from the area showed collapsed walls and sidewalks covered with cinder block-size chunks of debris. One photograph, taken by members of a American church mission group, showed a dead man in a gray T-shirt and jeans, hunched on the ground. Another broadcaster, Radio-Tele Ginen, collected and posted photographs on its Web site that hinted at the damage and
+0	What types of geese are used for human consumption?	roam widely in the knowledge that they will return home by dusk. The Chinese goose is more aggressive and noisy than other geese and can be used as a guard animal to warn of intruders. The flesh of meat geese is dark-coloured and high in protein, but they deposit fat subcutaneously, although this fat
+0	what is name of new video game?	of your favorite fictional universe and weaving them into a new story that captures the imagination of fans. In the dark, cinematic "Batman: Arkham Asylum," Batman must escape from a spooky psychiatric hospital. Buzz is building online for the dark, cinematic game, which reimagines a brooding Batman and his most notorious nemesis, the Joker, for an experience that's reminiscent of "The Dark Knight" blockbuster movie. You want mayhem, insanity and brutality? Get ready for all of it. "This is the Batman movie I would have liked to have written," said Dini, who was scheduled to discuss
+0	what is the name of the large city in southeastern nevada	tourism , including gambling . The city serves as world headquarters for the world 's two largest Fortune 500 gaming companies , Harrah 's Entertainment and MGM Mirage .
+0	what does somalia want	of using force against the pirates, the Russian news agency Interfax reported. "The questions of freeing the ships and crew are being dealt with in line with the corresponding international practices," Interfax quoted Navy spokesman Igor Dygalo as saying. "For understandable reasons, the use of force would be an extreme measure because it could threaten the life of the international crew of the ship." The pirates took over the MV Faina last week off the coast of Somalia and are demanding
+0	when was nepal declared federal democratic republic country	has changed Nepal practically into a federal democratic republic by making 7 unnamed states
+0	animal that live both on land and in water	are animals that live predominantly or entirely on land ( e.g. , cats , ants , spiders ) , as compared with aquatic animals , which live predominantly or entirely in the water ( e.g.g. , frogs , or newts
+0	Conservative theology was the official position of what?	formed by moderate Southern Baptists who disagreed with the direction in which the Southern Baptist Convention was heading: the Alliance of Baptists in 1987 and
+0	What laws did homeland security waive?	environmental groups, Defenders of Wildlife and the Sierra Club, have filed appeals with the U.S. Supreme Court, claiming the waivers are unconstitutional and set a dangerous precedent. "National security and environmental protection do not have to be at odds with each other," says Defenders of Wildlife spokesman Matt Clark. "If we can drop this arbitrary deadline for constructing the fence and go through the proper procedures, then there are inevitably ways to minimize environmental impact, but as it is now it's throwing all of those laws out the window." Mountain lion tracker Jack Childs also worries about the impact of the fence on local wildlife, especially the jaguar.
+1	How many PVA soldiers fought in this battle and lost?	U.S. 2nd Infantry Division Warrior Division's 23rd Regimental Combat Team with an attached French Battalion was hemmed in by more than 25,000 Chinese Communist forces. United Nations forces had previously retreated in the face of large Communist forces instead of getting
+0	who plays sugar ray leonard in hands of stone	Usher Raymond IV as
+0	What is the name of the person titled Drift King who helped with editorial supervision of the sports manga series, Initial D?	is centered on the prefecture of Gunma, more specifically on several mountains in the Kantō region and in their surrounding cities and towns. Although some of the names of the locations the characters race in have been fictionalized, all of the locations in the series are based on actual locations in Japan. He is also known as the Drift King (ドリキン , Dorikin ) for his
+0	Who he beats Larry Hopkins?	stretch in Graterford Prison in Pennsylvania between 1983 and 1988. So impressed was Hopkins with Bozella that he trained with the newcomer ahead of his much-anticipated fight. "It inspired me," said Hopkins of Bozella. "He chose to do what he did, and not only did he get freedom, but he got humanity. An opportunity to do something that was taken from him years and years ago. To be on a major, major fight card. "And now that we became,
+0	where does the rail trail start and end	years ( as of 2011 ) . The trail is also accepted as being , by a large margin , the biggest non-farming economic factor in
+0	when did milwaukee brewers move to the nl	regular season began , two new teams -- the Arizona Diamondbacks and Tampa Bay Devil Rays -- were added by Major League Baseball . This resulted in
+1	when was the group areas act law passed	Act No. 41 of 1950 </Td> </Tr> <Tr> <Th> Enacted by </Th> <Td> Parliament of South Africa </Td> </Tr> <Tr> <Th> Date passed </Th> <Td> 7 July 1950 </Td> </Tr> <Tr> <Th> Date of Royal Assent </Th> <Td> 24 June 1950 </Td> </Tr> <Tr> <Th> Date commenced </Th> <Td> 30 March 1951 (
+0	when was the last hurricane to hit bermuda	which the passengers found surprisingly hospitable . Hurricane Fabian was the most intense storm to impact the territory in modern times , though officially it did not make landfall , and was the only storm to have its name retired for effects
+1	when was the last time the san antonio spurs missed the playoffs	have only missed the playoffs four times since entering the NBA ; they have not missed the playoffs in the 20 seasons since Tim Duncan was drafted by the Spurs in 1997 . With their 50th win in the 2016 -- 17 season , the Spurs extended their record for most consecutive 50 - win seasons to 18 ( the Spurs did not
+0	the creation of the federal reserve system was an attempt to	Reserve System ( also known as the Federal Reserve or simply the Fed ) is the central banking system of the United States of America . Over the years , events such as the Great Depression in the 1930s and the Great Recession during the 2000s have led to the expansion of the
+0	group f / 64 was a major backlash against the earlier photographic movement of	f / 64 was formed , Edward Weston went to a meeting of the John Reed Club , which was founded to support Marxist artists and writers . These circumstances not only helped set up the situation in which a group
+0	Bessarabia eventually became under the control of which country?	city of Vilnius – its historical capital, which was under Polish control during the inter-war
+0	Iran's inflation led to what in 1975-1976?	the economy of Iran was flooded with foreign currency, which caused inflation. By 1974, the economy of Iran was experiencing double digit inflation, and despite many large projects to modernize the country, corruption was rampant and caused large
+1	How many steam warships did Japan have in 1867?	Yokosuka and Nagasaki. By the end of the Tokugawa shogunate in 1867, the Japanese navy of the shogun already possessed eight western-style steam warships around the flagship Kaiyō Maru, which were used against pro-imperial forces during the Boshin war, under the command
+0	How many people were inside?	of former NFL head coach Dan Reeves, suffered a broken back. DeCamillis was seen on a stretcher wearing a neck brace. A line of heavy thunderstorms was moving through the Dallas area at the time, he said, but no other damage to buildings was reported, said Mike Adams, a dispatcher for the Irving, Texas, fire department. Watch the roof collapse on players, coaches » Arnold Payne, a photographer for WFAA, was shooting the Cowboys' practice session when rain began falling "tremendously hard." "I noticed the walls started to waver ... and then I noticed that the lights that were hanging from the ceiling started to sway, and it wouldn't stop," Payne told CNN. Shortly after that, he said, "It was as if someone took a stick pin and hit a balloon." Watch Payne describe being inside when structure collpased » Payne said
+0	Ishita Dutta is the sister of an actress who is typically cast in what genre of movies?	the suspense thriller film "Drishyam" (2015) and the Hindi soap opera "Ek Ghar Banaunga", that aired on Star Plus. She is the younger sister of actress Tanushree Dutta. Dutta is the recipient of Femina Miss India Universe title in 2004. During the same year
+0	when did the the civil war start and end	</Th> </Tr> <Tr> <Td> <P> 110,000 + killed in action / died of wounds 230,000 + accident / disease deaths 25,000 -- 30,000 died in Confederate prisons </P> <P> 365,000 + total dead
+1	What has Pakistan told phone companies?	Islamabad, Pakistan (CNN) -- Under heavy criticism for a telling cell phone carriers to ban certain words in text messages, the Pakistan Telecommunication Authority went into damage control mode Wednesday. PTA spokesman Mohammed Younis Wednesday denied the existence of the plan, which has met with derision from mobile phone users in the country. "If at all we finally decide to
+0	What did Bush say the proposal was to a proposal he vetoed before?	(CNN) -- President Bush vetoed an expansion of the federally funded, state-run health insurance program for poor children for a second time Wednesday, telling Congress the bill "moves our country's health care system in the wrong direction." In his veto message, President Bush calls on Congress to extend funding for the current program. "Because the Congress has chosen to send me an essentially identical bill that has the same problems as the flawed bill I previously vetoed, I must veto this legislation, too," he said in a statement released by the White House. The bill would
+0	Where did the football team that Bob Simmons coached from 1995 to 2000 play their home games?	Cowboys football team [SEP] The 1998 Oklahoma State Cowboys football team represented the Oklahoma State University during the 1998 NCAA Division I-A football season. They participated as members of the Big 12 Conference in the South Division. They were coached by head coach Bob Simmons. [PAR] [TLE] Bob Simmons (American football coach) [SEP] Bob
+0	What anniversary was recently celebrated in Iran?	us to move our policy in a new direction," Obama said. "So there are going to be a set of objectives that we have in these conversations, but I think that there's the possibility at least of a relationship of mutual respect and progress." The United States and Iran have not had diplomatic relations since 1979. During that year, the Shah of Iran was forced to flee the country and the Ayatollah Khomeini took power. Later that year, Iranian students took over and seized hostages at the U.S. Embassy. Relations have been cut since then. U.S. President George W. Bush labeled Iran as a member of the "axis of evil" after the Sept. 11, 2001 attacks. Iran celebrated the 30th anniversary of the revolution Tuesday with crowds chanting "Death to America." Watch the parade in Iran Â» Tensions have rippled over issues such as Iran's nuclear program, Israel, and Iraq, and have been aggravated since the outspoken Ahmadinejad came to power in 2005. Western
+1	Which Italian composer did George Balanchine add in 1976?	[PAR] [TLE] Arcangelo Corelli [SEP] Arcangelo Corelli ( ; 17 February 1653 – 8 January 1713) was an Italian violinist and composer of the Baroque era. His music
+0	Will the playstation 4 be announced?	a new system sometime in the next five years, of course. Sony continued to sell the PlayStation 2 system and games years after the PlayStation 3 debuted in stores. For Sony's next console, the company will not deploy a streaming delivery system like OnLive, or fully cut out disc retailers like Best Buy and GameStop, Hirai said. While Sony has increased the number of games and other media available for download or streaming through its networks, most people cannot be expected to frequently download several gigabytes worth of data, which can be a time-consuming process, he said. Sony Computer Entertainment president Andrew House said earlier that Sony is not planning to discuss a new console, the website ComputerAndVideogames.com reported on Monday.
+1	How many children were the Americans trying to kidnap out of Haiti?	Port-au-Prince, Haiti (CNN) -- A Haitian attorney representing 10 Americans charged with kidnapping for trying to take 33 children out of Haiti told CNN Sunday he has resigned. Edwin Coq said he had quit as a lawyer for the Americans. It wasn't immediately clear who would replace him. "I know that they have been looking at other lawyers," said Phyllis Allison, mother of one of those detained, Jim Allen. "They don't know what to do." The 10 missionaries, including group leader Laura Silsby, were charged Thursday with kidnapping children and criminal association. Coq had said that court hearings would be held Monday
+0	who kills tree gelbman in happy death day	Tree convinces Carter of her predicament by showing that she holds foreknowledge of the day 's events . Tree admits to Carter she does n't like who
+0	What will no person be denied the enjoyment of in Georgia based on their religious principles?	amended as follows: "Article IV. Section 10. No person within this state shall, upon any pretense, be deprived of the inestimable privilege of worshipping God in any
+0	who came up with the idea of football	pass . The popularity of college football grew as it became the dominant version of the sport in the United States for the first half of the 20th century . Bowl games , a college football tradition , attracted a national audience for college
+0	what is the name of the female smurf	before the smurflings created Sassette , Smurfette was the only female smurf in the Smurf Village .
+0	Who contributed to the American studies programs at Yale and University of Wyoming?	struggle. Norman Holmes Pearson, who worked for the Office of Strategic Studies in London during World War II, returned to Yale and headed the new American studies program, in which scholarship quickly became an instrument of promoting
+0	What is the group's former name that now has an office with the Chief Actuary besides the Social Security Administration?	Office of the Chief Actuary [SEP] The Office of the Chief Actuary is a government agency that has responsibility for actuarial estimates regarding social welfare programs. In Canada, the Office of the Chief Actuary works with the Canada Pension Plan and the Old Age Security Program. In the United States, both the Social Security Administration and the Centers for Medicare and Medicaid Services have an Office of the Chief Actuary that deals with Social Security and Medicare, respectively. A similar agency in the United Kingdom is called the Government Actuary's Department
+0	The actor that playes Han Solo in the "Star Wars" film series stars with Blake Lively and Michiel Huisman in a film directed by who?	about a woman who stops aging after an accident at the age of 29. Mills Goodloe and Salvador Paskowitz. The film stars Blake Lively, Michiel Huisman, Kathy Baker, Amanda Crew, Harrison Ford, and Ellen Burstyn. The film was theatrically released on April 24, 2015 by Lionsgate. [PAR] [TLE] Harrison Ford [SEP] Harrison Ford
+0	What historically black university's men's basketball coach was formerly head coach at Virginia Tech?	well as an 1890 Historically Black Land-Grant University. The University is a member-school of the Thurgood Marshall College Fund. He was also the head coach at Virginia Tech, Tennessee
+0	what year did syracuse win the ncaa tournament	. Their combined record is 67 -- 39 .
+1	where do i get chips at a casino	<P> Money is exchanged for tokens in a casino at the casino cage , at the gaming tables , or at a cashier station . The tokens are
+0	when was the winter fuel payment first introduced	heating over the winter months .
+0	Trophy hunting can include areas which would likely be unsuitable for what other types of ecotourism?	study states that less than 3% of a trophy hunters' expenditures reach the local level, meaning that the economic incentive and benefit is "minimal, particularly when we consider the vast areas of
+1	In simple language, what are the interconnections in an embedding matrix?	Since it was quite easy to stack interconnections (wires) inside the embedding matrix, the approach allowed designers to forget completely about the routing of wires (usually a time-consuming operation of PCB design): Anywhere the designer needs a connection, the machine will draw a wire in straight line from one location/pin
+0	who has been to the most all star games in baseball	24 </Li> <Li> Stan Musial 24 </Li>
+0	In 1169, Ireland was invaded by which people?	High King to ensure the terms of the Treaty of Windsor led Henry II, as King of England, to rule as effective monarch under the title of Lord of Ireland. This title was granted to his younger son but when Henry's heir unexpectedly died the title of King of England and Lord of Ireland became entwined in one
+1	What year did a biracial Populist fusion gain the Governors office?	to the legislature and governor's office, but the Populists attracted voters displeased with them. In 1896 a biracial, Populist-Republican Fusionist coalition gained the governor's office. The Democrats regained control of the legislature
+1	nearest metro station to majnu ka tilla delhi	Ring Road of Delhi . It is at a walkable distance from ISBT Kashmere Gate . It is approachable through the Kashmeri Gate station of the Delhi Metro , lies on both the Red ( Dilshad Garden - Rithala ) and Yellow Lines ( Samaypur
+1	where is california located in the region of the united states	<P> California is a U.S. state in the Pacific Region of the United States . With 39.5 million residents , California is the most populous state in the United States and the third largest by area . The
+1	when did the baptist church start in america	coworker for religious freedom , are variously credited as founding the earliest Baptist church in North America . In 1639 , Williams established a Baptist church in Providence , Rhode Island , and Clarke began a Baptist church in
+0	where was the first capital of the united states located	passed to pave the way for a permanent capital . The decision to locate the capital was contentious , but Alexander Hamilton helped broker a compromise in which the federal government would take on war debt incurred during the American Revolutionary War , in exchange for support from northern states for locating the capital along the Potomac
+0	What will new regulations will reduce?	products off of the consumer market," said Michael Fry, director of conservation advocacy for the American Bird Conservancy. "By putting these restrictions in place, they are allowing a compromise to be made between themselves and organizations who have been working on this problem for a long time." The EPA's new measures, which were handed down Thursday, require that rat poisons be kept in bait stations above ground and in containers that meet agency standards. Loose bait, such as pellets, and the four most hazardous types of pesticides, known as "second-generation anticoagulants," will no longer be sold for personal use. Under the new restrictions, only farmers, livestock owners and certified rodent control employees will be allowed to purchase rat poison in bulk. Bags larger than 8 pounds will no longer be sold at hardware and home-improvement stores. Children who come into contact
+0	who played lois lane in the man of steel	mixture of toughness and vulnerability , but Peter Bradshaw thought that the character was `` sketchily conceived '' and criticized her lack of chemistry with Cavill . Even so , the film earned over $660 million to become one of her biggest box
+0	What year did the writer of the 1968 novel "The Iron Man" become Poet Laurete?	Giant is a 1999 American animated science-fiction comedy-drama action film using both traditional animation and computer animation, produced by and directed by Brad Bird in his directorial debut. It is based on the 1968 novel "The Iron Man" by Ted Hughes (which was published in the United States as "The Iron Giant") and was scripted by Tim McCanlies from a story treatment by Bird. The film stars the voices of Eli Marienthal,
+0	The conquest of Nice was an effort by Suleiman and what French king?	allies. A month prior to the siege of Nice, France supported the Ottomans with an artillery unit during the 1543 Ottoman conquest of Esztergom in northern Hungary. After further advances by the Turks, the Habsburg ruler Ferdinand officially recognized Ottoman ascendancy in Hungary in
+0	when was the vaccine received	for swine flu, also known as 2009 H1N1, using reverse genetics, he said. "Suitable viruses will hopefully be sent to manufacturers by end of next week," Skinner wrote. Once that happens, vaccine makers will tweak the virus and have "pilot lots" of vaccine ready to be tested by mid- to late June. Several thousand cases have been reported
+1	What is the nationality of the actor who costarred with Matt LeBlanc in "All the Queen's Men"?	an approximate -99.92% return. [PAR] [TLE] Eddie Izzard [SEP] Edward John "Eddie" Izzard ( ; born 7 February 1962) is an English stand-up comedian, actor, writer and political activist. His comedic style takes the form of rambling, whimsical monologue, and self-referential pantomime. He
+0	What sickened thousands of children?	executives detained, a local official said, according to Xinhua, Initial tests showed more than 1,300 children in the Hunan province town of Wenping have excessive lead in their blood from the Wugang Manganese Smelting Plant. A second round of testing has been ordered to confirm the results. The plant opened in May 2008 without gaining the approval of the local environment protection bureau, said Huang Wenbin, a deputy environment chief in Wugang City, Xinhua reported. The plant was within 500 meters (about a quarter mile) of three schools. The
+0	What percentage of the population are the Kpelle?	are descendants of African American and West Indian, mostly Barbadian settlers, make up 2.5%. Congo people, descendants of repatriated Congo and Afro-Caribbean
+1	Amount of people left homeless?	86 dead, the state news agency said. About 30 people are missing, the official news agency Agencia Brasil said, citing civil defense officials. Earlier reports had indicated as many as 100 people were dead. In addition, more than 54,000 residents have been left homeless, and another 1.5 million have been affected by the heavy rains, the state news agency reported. Brazilian President Luiz Inacio Lula da Silva announced he will release nearly 700 million reais ($350 million)
+0	What other countries were in disagreement with the United Nations decision on Burma ?	that strongly called upon the government of Myanmar to end its systematic violations of human rights. In January 2007, Russia and China vetoed a
+0	Besides Barcelona and Real Madrid, what other team has remained in the Primera Division?	first football club to win six out of six competitions in a single year, completing the sextuple in also winning the Spanish Super Cup, UEFA Super Cup and FIFA Club World Cup. In 2011, the club became
+0	William Frederick Truax, is a former professional American football tight end in the National Football League (NFL) from 1964 to 1973 for the Los Angeles Rams and the Dallas Cowboys, following the 1970 NFL season, Truax was traded by the Rams to the Cowboys for wide receiver Lance Rentzel, a former American football flanker, in which organization?	in New Orleans and college football at Louisiana State University and was drafted in the second round of the 1964 NFL draft. Following the 1970 NFL season, Truax was traded by the Rams to the Cowboys for wide receiver Lance Rentzel. He was part of the Cowboys' Super Bowl VI championship team in 1971. He played
+1	What year did Chopin learn that the uprising in Warsaw was crushed?	enlist. Chopin, now alone in Vienna, was nostalgic for his homeland, and wrote to a friend, "I curse the moment of my departure." When in September 1831 he learned, while travelling from Vienna to Paris, that the uprising had been crushed, he expressed his anguish in the pages of his private journal: "Oh
+1	where do they make money in washington dc	; all coinage is produced by the United States Mint . With production facilities in Washington , DC , and Fort Worth , Texas , the Bureau of Engraving and Printing is the largest producer of government security documents in the United States . </P>
+0	What did a researcher compare this process to?	which makes it one of the highest rates of maternal mortality in the Americas. In wealthy developed nations, only nine women die for every 100,000 births. The five main causes of pregnancy-related deaths in Peru are hemorrhage, pre-eclampsia, infection, complications following abortion and obstructed birth, according to Peru's Ministry of Health figures. Amnesty's Peru researcher Nuria Garcia said, in a written statement: "The rates of maternal mortality in Peru are scandalous. The fact that so many women are dying from preventable causes is a human rights violation. "The Peruvian state is simply ignoring
+0	How many containers can Longtan Containers Port Area handle?	Port of Nanjing is the largest inland port in China, with annual cargo tonnage reached 191,970,000 t in 2012. The port area is 98 kilometres (61 mi) in length and has 64 berths
+0	The 2011 New York City Marathon was sponsored by which Dutch multinational banking corporation?	are retail banking, direct banking, commercial banking, investment banking, asset management, and insurance services. ING is an abbreviation for "Internationale Nederlanden Groep " (English: International Netherlands Group). [PAR] [TLE] 2011 New York City Marathon [SEP] The 42nd New York City Marathon took
+0	What is human flourishing?	it does not involve believing that human nature is purely good or that all people can live up to the Humanist ideals without help. If anything, there is recognition that living up to one's potential is hard
+0	What was the result of Dida appeal	to play in next month's Champions League match at Shakhtar Donetsk after partially winning his appeal to UEFA against a two-match ban. Dida has had one game of his two-match ban suspended for a year following an appeal to UEFA. Brazilian Dida was also fined 60,000 Swiss francs by European football's ruling body following an incident involving a supporter during the Champions clash against Celtic in Scotland on October 3. The 34-year-old Brazilian was initially banned for two games for his theatrics following a Celtic fan's encroachment onto the pitch during the 2-1 defeat at Celtic
+1	What is more plentiful in capital projects?	generates economic distortion in the public sector by diverting public investment into capital projects where bribes and kickbacks are more plentiful. Officials may increase the technical complexity of public sector projects to conceal or
+0	where were band greeted with cheers?	the United States for a show in Stamford, Connecticut, on Tuesday, after they have "a few days off to recuperate," Robinson said. The trio was the opening act for Nelson until they were loudly booed in Toronto, a day after the actor-musician's bizarre interview with a CBC radio host. Ironically, the comments that offended Canadians included Thornton's assessment that they were "very reserved" and "it doesn't matter what you say to them." "It's mashed potatoes with no gravy," Thornton told CBC host Jian Ghomeshi. "We tend to play places where people throw things at each other and here they just sort of sit there," he said. Watch Thornton's interview » The audience at Thursday night's show in Toronto loudly booed the Boxmasters, with some shouts of "Here comes the gravy!" The Toronto Star newspaper reported. Thornton's remarks about
+0	What do Mexicans call Mexico City?	the Federal District in Spanish: D.F., which is read "De-Efe"). They are formally called capitalinos (in reference to the city being the capital of the country), but "[p]erhaps because capitalino is the
+0	where does lock stock and barrel come from	individual components one at a time . One craftsman made the `` lock '' which would have been a `` match lock '' , `` wheel lock '' , `` flint lock '' etc .
+1	who has the power to establish a prison system	<P> The Federal Bureau of Prisons ( BOP ) is a United States federal law enforcement agency . A subdivision of
+0	what are south americas only 2 landlocked countries	such countries , including five partially recognised states .
+0	Who does Johnny Depp play?	is back as another unexpectedly charismatic outlaw: Depression-era bank robber John Dillinger, a character he says he's been drawn to since he was a boy. "I sort of had a fascination with John Dillinger when I was about 10, 11 years old, for some reason," Depp told CNN. "I always kind of admired him, oddly." Oddly, perhaps, because for a short but intense period between September 1933 and July 1934 Dillinger and his gang rampaged through the American Midwest, staging jail breaks, robbing banks, and killing 10 men and wounding seven along the way. Dillinger's violent spree is the focus of gangster drama "Public Enemies," the latest offering from director Michael Mann, and also starring Christian Bale and
+0	What has hurt U.S. tire industry?	S. tire industry. The White House announced plans to impose tariffs on some tires entering the United States from China. The new tariffs will be on passenger car and light truck tires, the White House said in a statement Friday night. "The president decided to remedy the clear disruption to the
+0	What was the name of Patrick Bouvier Kennedy's younger brother, who was known as an American lawyer, journalist, magazine publlisher?	Patrick Bouvier Kennedy [SEP] Patrick Bouvier Kennedy (August 7, 1963 – August 9, 1963) was the last child of United States President John F. Kennedy and First Lady Jacqueline Bouvier Kennedy. He was the younger brother of Caroline and John Jr., and had a third sibling who was stillborn. Born prematurely, Patrick Bouvier Kennedy lived just over 39 hours before desperate attempts to save him failed, putting the First Family and nation into mourning. Three months later, his death was eclipsed by his father's assassination, but eventually his
+0	Who founded the air force that gives to Alessandro Reach the major of "tenente-colonnello"?	Air Force (Italian: Aeronautica Militare; AM) is the aerial defence force of the Italian Republic. After World War II, when Italy was made a republic by referendum, the "Regia Aeronautica" was given
--- a/data/mrqa/convert.py
+++ b/data/mrqa/convert.py
+# coding: utf-8
+f='mrqa-combined.train.raw.json'
+import json
+a=json.load(open(f))
+a=a['data']
+writer = open('train.json','w')
+for s in a:
+    p = s['paragraphs']
+    assert len(p) == 1
+    p = p[0]
+    q = {}
+    q['context'] = p['context']
+    q['qa_list'] = p['qas']
+    writer.write(json.dumps(q)+'\n')
+writer.close()
--- a/data/mrqa/mrqa-combined.dev.raw.json
+++ b/data/mrqa/mrqa-combined.dev.raw.json
--- a/data/mrqa/train.json
+++ b/data/mrqa/train.json
--- a/data/user_define/user_define.md
+++ b/data/user_define/user_define.md
+User-defined model dataset.
--- a/demo.py
+++ b/demo.py
+import paddlepalm as palm
+if __name__ == '__main__':
+    controller = palm.Controller('config.yaml', task_dir='task_instance')  # first controller: config path passed positionally, for_train left at its default
+    controller.load_pretrain('pretrain_model/ernie/params')  # presumably the ERNIE parameters fetched by download_pretrain.sh -- confirm
+    controller.train()
+    controller = palm.Controller(config='config.yaml', task_dir='task_instance', for_train=False)  # rebinds `controller` to a second instance built with for_train=False
+    controller.pred('mrqa', inference_model_dir='output_model/firstrun/infer_model')  # predicts for the 'mrqa' task; the model dir is presumably produced by train() above -- confirm
--- a/download_pretrain.sh
+++ b/download_pretrain.sh
@@ -21,6 +21,10 @@ else
    exit 1
 fi
+if [[ ! -d pretrain_model ]]; then
+    mkdir pretrain_model
+fi
 cd pretrain_model
 mkdir $name
 cd $name

--- a/mtl_config.yaml
+++ b/mtl_config.yaml
-main_task: "reading_comprehension"
-auxiliary_task: "mask_language_model answer_matching"
-do_train: True
-do_predict: True
-use_cuda: True
-checkpoint_path: "output_model/firstrun"
-backbone_model: "bert_model"
-pretrain_model_path: "pretrain_model/bert"
-pretrain_config_path: "pretrain_model/bert/bert_config.json"
-vocab_path: "pretrain_model/bert/vocab.txt"
-# backbone_model: "ernie_model"
-# pretrain_model_path: "pretrain_model/ernie/params"
-# pretrain_config_path: "pretrain_model/ernie/ernie_config.json"
-# vocab_path: "pretrain_model/ernie/vocab.txt"
-optimizer: "bert_optimizer"
-learning_rate: 3e-5
-lr_scheduler: "linear_warmup_decay"
-skip_steps: 10
-save_steps: 10000
-epoch: 2
-warmup_proportion: 0.1
-weight_decay: 0.1
-do_lower_case: True
-max_seq_len: 512
-use_ema: True
-ema_decay: 0.9999
-random_seed: 0
-loss_scaling: 1.0
--- a/mtl_run.py
+++ b/mtl_run.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# -*- coding: utf-8 -*-
-import os
-import sys
-import time
-import argparse
-import importlib
-import collections
-import numpy as np
-import multiprocessing
-import paddle
-import paddle.fluid as fluid
-from utils.configure import PDConfig
-from utils.placeholder import Placeholder 
-from utils.configure import JsonConfig, ArgumentGroup, print_arguments
-from utils.init import init_pretraining_params, init_checkpoint
-sys.path.append("reader")
-import joint_reader
-from joint_reader import create_reader
-sys.path.append("optimizer")
-sys.path.append("paradigm")
-sys.path.append("backbone")
-TASKSET_PATH="config"
-def train(multitask_config): 
-    # load task config
-    print("Loading multi_task configure...................")
-    args = PDConfig(yaml_file=[multitask_config])
-    args.build()
-    index = 0
-    reader_map_task = dict()
-    task_args_list = list()
-    reader_args_list = list()
-    id_map_task = {index: args.main_task}
-    print("Loading main task configure....................")
-    main_task_name = args.main_task
-    task_config_files = [i for i in os.listdir(TASKSET_PATH) if i.endswith('.yaml')]
-    main_config_list = [config for config in task_config_files if config.split('.')[0] == main_task_name]
-    main_args = None
-    for config in main_config_list: 
-        main_yaml = os.path.join(TASKSET_PATH, config)
-        main_args = PDConfig(yaml_file=[multitask_config, main_yaml])
-        main_args.build()
-        main_args.Print()
-        if not task_args_list or main_task_name != task_args_list[-1][0]: 
-            task_args_list.append((main_task_name, main_args))
-        reader_args_list.append((config.strip('.yaml'), main_args))
-        reader_map_task[config.strip('.yaml')] = main_task_name
-    print("Loading auxiliary tasks configure...................")
-    aux_task_name_list = args.auxiliary_task.strip().split()
-    for aux_task_name in aux_task_name_list: 
-        index += 1
-        id_map_task[index] = aux_task_name
-        print("Loading %s auxiliary tasks configure......." % aux_task_name)
-        aux_config_list = [config for config in task_config_files if config.split('.')[0] == aux_task_name]
-        for aux_yaml in aux_config_list: 
-            aux_yaml = os.path.join(TASKSET_PATH, aux_yaml)
-            aux_args = PDConfig(yaml_file=[multitask_config, aux_yaml])
-            aux_args.build()
-            aux_args.Print()
-            if aux_task_name != task_args_list[-1][0]: 
-                task_args_list.append((aux_task_name, aux_args))
-            reader_args_list.append((aux_yaml.strip('.yaml'), aux_args))
-            reader_map_task[aux_yaml.strip('.yaml')] = aux_task_name
-    # import tasks reader module and build joint_input_shape
-    input_shape_list = []
-    reader_module_dict = {}
-    input_shape_dict = {}
-    for params in task_args_list: 
-        task_reader_mdl = "%s_reader" % params[0]
-        reader_module = importlib.import_module(task_reader_mdl)
-        reader_servlet_cls = getattr(reader_module, "get_input_shape")
-        reader_input_shape = reader_servlet_cls(params[1])
-        reader_module_dict[params[0]] = reader_module
-        input_shape_list.append(reader_input_shape)
-        input_shape_dict[params[0]] = reader_input_shape
-    train_input_shape, test_input_shape, task_map_id = joint_reader.joint_input_shape(input_shape_list)
-    # import backbone model
-    backbone_mdl = args.backbone_model
-    backbone_cls = "Model"
-    backbone_module = importlib.import_module(backbone_mdl)
-    backbone_servlet = getattr(backbone_module, backbone_cls)
-    if not (args.do_train or args.do_predict):
-        raise ValueError("For args `do_train` and `do_predict`, at "
-                         "least one of them must be True.")
-    if args.use_cuda:
-        place = fluid.CUDAPlace(0)
-        dev_count = fluid.core.get_cuda_device_count()
-    else:
-        place = fluid.CPUPlace()
-        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-    exe = fluid.Executor(place)
-    startup_prog = fluid.default_startup_program()
-    if args.random_seed is not None:
-        startup_prog.random_seed = args.random_seed
-    if args.do_train: 
-        #create joint pyreader
-        print('creating readers...')
-        gens = []
-        main_generator = ""
-        for params in reader_args_list: 
-            generator_cls = getattr(reader_module_dict[reader_map_task[params[0]]], "DataProcessor")
-            generator_inst = generator_cls(params[1])
-            reader_generator = generator_inst.data_generator(phase='train', shuffle=True, dev_count=dev_count)
-            if not main_generator: 
-                main_generator = generator_inst
-            gens.append((reader_generator, params[1].mix_ratio, reader_map_task[params[0]]))
-        joint_generator, train_pyreader, model_inputs = create_reader("train_reader", train_input_shape, True, task_map_id, gens)
-        train_pyreader.decorate_tensor_provider(joint_generator)
-        # build task inputs 
-        task_inputs_list = []
-        main_test_input = []
-        task_id = model_inputs[0]
-        backbone_inputs = model_inputs[task_map_id[0][0]: task_map_id[0][1]]
-        for i in range(1, len(task_map_id)): 
-            task_inputs = backbone_inputs + model_inputs[task_map_id[i][0]: task_map_id[i][1]]
-            task_inputs_list.append(task_inputs)
-        # build backbone model
-        print('building model backbone...')
-        conf = vars(args)
-        if args.pretrain_config_path is not None:
-            model_conf = JsonConfig(args.pretrain_config_path).asdict()
-            for k, v in model_conf.items():
-                if k in conf:
-                    assert k == conf[k], "ERROR: argument {} in pretrain_model_config is NOT consistent with which in main.yaml"
-            conf.update(model_conf)
-        backbone_inst = backbone_servlet(conf, is_training=True)
-        print('building task models...')
-        num_train_examples = main_generator.get_num_examples()
-        if main_args.in_tokens:
-            max_train_steps = int(main_args.epoch * num_train_examples) // (
-                    main_args.batch_size // main_args.max_seq_len) // dev_count
-        else:
-            max_train_steps = int(main_args.epoch * num_train_examples) // (
-                main_args.batch_size) // dev_count
-        mix_ratio_list = [task_args[1].mix_ratio for task_args in task_args_list]
-        args.max_train_steps = int(max_train_steps * (sum(mix_ratio_list) / main_args.mix_ratio))
-        print("Max train steps: %d" % max_train_steps)
-        build_strategy = fluid.BuildStrategy()
-        train_program = fluid.default_main_program()
-        with fluid.program_guard(train_program, startup_prog):
-            with fluid.unique_name.guard():
-                backbone_inst.build_model(backbone_inputs)
-                all_loss_list = []
-                for i in range(len(task_args_list)): 
-                    task_name = task_args_list[i][0]
-                    task_args = task_args_list[i][1]
-                    if hasattr(task_args, 'paradigm'):
-                        task_net = task_args.paradigm
-                    else:
-                        task_net = task_name
-                    task_net_mdl = importlib.import_module(task_net)
-                    task_net_cls = getattr(task_net_mdl, "create_model")
-                    output_tensor = task_net_cls(task_inputs_list[i], base_model=backbone_inst, is_training=True, args=task_args)
-                    loss_cls = getattr(task_net_mdl, "compute_loss")
-                    task_loss = loss_cls(output_tensor, task_args)
-                    all_loss_list.append(task_loss)
-                    num_seqs = output_tensor['num_seqs']
-                task_one_hot = fluid.layers.one_hot(task_id, len(task_args_list))
-                all_loss = fluid.layers.concat(all_loss_list, axis=0)
-                loss = fluid.layers.reduce_sum(task_one_hot * all_loss)
-                programs = [train_program, startup_prog]
-                optimizer_mdl = importlib.import_module(args.optimizer)
-                optimizer_inst = getattr(optimizer_mdl, "optimization")
-                optimizer_inst(loss, programs, args=args)
-                loss.persistable = True
-                num_seqs.persistable = True
-                ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
-                ema.update()
-        train_compiled_program = fluid.CompiledProgram(train_program).with_data_parallel(
-            loss_name=loss.name, build_strategy=build_strategy)
-    if args.do_predict:
-        conf = vars(args)
-        if args.pretrain_config_path is not None:
-            model_conf = JsonConfig(args.pretrain_config_path).asdict()
-            for k, v in model_conf.items():
-                if k in conf:
-                    assert v == conf[k], "ERROR: argument {} in pretrain_model_config is NOT consistent with which in main.yaml".format(k)
-            conf.update(model_conf)
-        mod = reader_module_dict[main_task_name]
-        DataProcessor = getattr(mod, 'DataProcessor')
-        predict_processor = DataProcessor(main_args)
-        test_generator = predict_processor.data_generator(
-            phase='predict',
-            shuffle=False,
-            dev_count=dev_count)
-        new_test_input_shape = input_shape_dict[main_task_name][1]['backbone'] + input_shape_dict[main_task_name][1]['task']
-        assert new_test_input_shape == test_input_shape
-        build_strategy = fluid.BuildStrategy()
-        test_prog = fluid.Program()
-        with fluid.program_guard(test_prog, startup_prog):
-            with fluid.unique_name.guard():
-                placeholder = Placeholder(test_input_shape)
-                test_pyreader, model_inputs = placeholder.build(
-                    capacity=100, reader_name="test_reader")
-                test_pyreader.decorate_tensor_provider(test_generator)
-                # create model
-                backbone_inst = backbone_servlet(conf, is_training=False)
-                backbone_inst.build_model(model_inputs)
-                task_net_mdl = importlib.import_module(main_task_name)
-                task_net_cls = getattr(task_net_mdl, "create_model")
-                postprocess = getattr(task_net_mdl, "postprocess")
-                global_postprocess = getattr(task_net_mdl, "global_postprocess")
-                output_tensor = task_net_cls(model_inputs, base_model=backbone_inst, is_training=False, args=main_args)
-                if 'ema' not in dir():
-                    ema = fluid.optimizer.ExponentialMovingAverage(args.ema_decay)
-                pred_fetch_names = []
-                fetch_vars = []
-                for i,j in output_tensor.items():
-                    pred_fetch_names.append(i)
-                    fetch_vars.append(j)
-                for var in fetch_vars:
-                    var.persistable = True
-                pred_fetch_list = [i.name for i in fetch_vars]
-        test_prog = test_prog.clone(for_test=True)
-        test_compiled_program = fluid.CompiledProgram(test_prog).with_data_parallel(
-            build_strategy=build_strategy)
-    exe.run(startup_prog)
-    if args.do_train:
-        if args.pretrain_model_path:
-            init_pretraining_params(
-                exe,
-                args.pretrain_model_path,
-                main_program=startup_prog,
-                use_fp16=False)
-        if args.checkpoint_path:
-            if os.path.exists(args.checkpoint_path):
-                init_checkpoint(
-                    exe,
-                    args.checkpoint_path,
-                    main_program=startup_prog,
-                    use_fp16=False)
-            else:
-                os.makedirs(args.checkpoint_path)
-    elif args.do_predict:
-        if not args.checkpoint_path:
-            raise ValueError("args 'checkpoint_path' should be set if"
-                             "only doing prediction!")
-        init_checkpoint(
-            exe,
-            args.checkpoint_path,
-            main_program=test_prog,
-            use_fp16=False)
-    if args.do_train:
-        print('start training...')
-        train_pyreader.start()
-        steps = 0
-        total_cost, total_num_seqs = [], []
-        time_begin = time.time()
-        while True:
-            try:
-                steps += 1
-                if steps % args.skip_steps == 0:
-                    fetch_list = [loss.name, num_seqs.name, task_id.name]
-                else:
-                    fetch_list = []
-                outputs = exe.run(train_compiled_program, fetch_list=fetch_list)
-                if steps % args.skip_steps == 0:
-                    np_loss, np_num_seqs, np_task_id = outputs
-                    total_cost.extend(np_loss * np_num_seqs)
-                    total_num_seqs.extend(np_num_seqs)
-                    time_end = time.time()
-                    used_time = time_end - time_begin
-                    current_example, epoch = main_generator.get_train_progress()
-                    cur_task_name = id_map_task[np_task_id[0][0]]
-                    print("epoch: %d, task_name: %s, progress: %d/%d, step: %d, loss: %f, "
-                          "speed: %f steps/s" %
-                          (epoch, cur_task_name, current_example, num_train_examples, steps,
-                           np.sum(total_cost) / np.sum(total_num_seqs),
-                           args.skip_steps / used_time))
-                    total_cost, total_num_seqs = [], []
-                    time_begin = time.time()
-                if steps % args.save_steps == 0:
-                    save_path = os.path.join(args.checkpoint_path,
-                                             "step_" + str(steps))
-                    fluid.io.save_persistables(exe, save_path, train_program)
-                if steps == max_train_steps:
-                    save_path = os.path.join(args.checkpoint_path,
-                                             "step_" + str(steps) + "_final")
-                    fluid.io.save_persistables(exe, save_path, train_program)
-                    break
-            except paddle.fluid.core.EOFException as err:
-                save_path = os.path.join(args.checkpoint_path,
-                                         "step_" + str(steps) + "_final")
-                fluid.io.save_persistables(exe, save_path, train_program)
-                train_pyreader.reset()
-                break
-    if args.do_predict:
-        print('start predicting...')
-        cnt = 0
-        if args.use_ema:
-            with ema.apply(exe):
-                test_pyreader.start()
-                pred_buf = []
-                while True:
-                    try:
-                        fetch_res = exe.run(fetch_list=pred_fetch_list, program=test_compiled_program)
-                        cnt += 1
-                        if cnt % 200 == 0:
-                            print('predicting {}th batch...'.format(cnt))
-                        fetch_dict = {}
-                        for key,val in zip(pred_fetch_names, fetch_res):
-                            fetch_dict[key] = val
-                        res = postprocess(fetch_dict)
-                        if res is not None:
-                            pred_buf.extend(res)
-                    except fluid.core.EOFException:
-                        test_pyreader.reset()
-                        break
-                global_postprocess(pred_buf, predict_processor, args, main_args)
-        else:
-            test_pyreader.start()
-            pred_buf = []
-            while True:
-                try:
-                    fetch_res = exe.run(fetch_list=pred_fetch_list, program=test_compiled_program)
-                    cnt += 1
-                    if cnt % 200 == 0:
-                        print('predicting {}th batch...'.format(cnt))
-                    fetch_dict = {}
-                    for key,val in zip(pred_fetch_names, fetch_res):
-                        fetch_dict[key] = val
-                    res = postprocess(fetch_dict)
-                    if res is not None:
-                        pred_buf.extend(res)
-                except fluid.core.EOFException:
-                    test_pyreader.reset()
-                    break
-            global_postprocess(pred_buf, predict_processor, args, main_args)
-if __name__ == '__main__':
-    multitask_config = "mtl_config.yaml"
-    train(multitask_config)
--- a/optimizer/README.md
+++ b/optimizer/README.md
-该目录存放优化器，用户可通过实现相关接口完成自定义
--- a/backbone/__init__.py
+++ b/backbone/__init__.py
--- a/paddlepalm/__init__.py
+++ b/paddlepalm/__init__.py
+import sys
+from paddlepalm.mtl_controller import Controller
+sys.path.append('paddlepalm')  # NOTE(review): relative entry, so it resolves against the current working directory, and it is added only after the Controller import above -- confirm both are intended
--- a/backbone/utils/__init__.py
+++ b/backbone/utils/__init__.py
--- a/paradigm/__init__.py
+++ b/paradigm/__init__.py
--- a/backbone/bert_model.py
+++ b/backbone/bert_model.py
+# -*- coding: UTF-8 -*-
 #   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -11,24 +12,27 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""BERT model"""
+"""v1.1 
+BERT model."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-import paddle.fluid as fluid
+from paddle import fluid
 from paddle.fluid import layers
-import backbone.utils.transformer as transformer
+from paddlepalm.backbone.utils.transformer import pre_process_layer, encoder
+from paddlepalm.interface import backbone
-class Model(object):
+class Model(backbone):
    def __init__(self,
                 config,
-                 is_training=False,
+                 phase):
-                 model_name=''):
+        # self._is_training = phase == 'train' # backbone一般不用关心运行阶段，因为outputs在任何阶段基本不会变
        self._emb_size = config["hidden_size"]
        self._n_layer = config["num_hidden_layers"]
        self._n_head = config["num_attention_heads"]
@@ -39,8 +43,6 @@ class Model(object):
        self._prepostprocess_dropout = config["hidden_dropout_prob"]
        self._attention_dropout = config["attention_probs_dropout_prob"]
-        self._is_training = is_training
        self.model_name = model_name
        self._word_emb_name = self.model_name + "word_embedding"
@@ -52,35 +54,48 @@ class Model(object):
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=config["initializer_range"])
-    def build_model(self, reader_input, use_fp16=False):
+    @property
+    def inputs_attr(self):
-        dtype = "float16" if use_fp16 else "float32"
+        """Describe the inputs this backbone reads.
+        Returns:
+            dict mapping each input name to a ``[shape, dtype]`` pair. The
+            first dimension is -1 and the second is
+            ``self._max_position_seq_len``.
+        """
+        # Each value must be a two-element list [shape, dtype]; the opening
+        # bracket of that outer list is required for the literal to parse.
+        return {"token_ids": [[-1, self._max_position_seq_len, 1], 'int64'],
+                "position_ids": [[-1, self._max_position_seq_len, 1], 'int64'],
+                "segment_ids": [[-1, self._max_position_seq_len, 1], 'int64'],
+                "input_mask": [[-1, self._max_position_seq_len, 1], 'float32']}
-        src_ids, pos_ids, sent_ids, input_mask = reader_input[:4]
+    @property
+    def outputs_attr(self):
+        """Describe the outputs produced by ``build``.
+        Returns:
+            dict mapping each output name to its shape list.
+        """
+        attrs = {}
+        attrs["word_emb"] = [-1, self._max_position_seq_len, self._emb_size]
+        # Both sentence-level outputs share the same shape but get their own
+        # list object, exactly like the literal form would produce.
+        for key in ("sentence_emb", "sentence_pair_emb"):
+            attrs[key] = [-1, self._emb_size]
+        return attrs
+    def build(self, inputs):
+        src_ids = inputs['token_ids']
+        pos_ids = inputs['position_ids']
+        sent_ids = inputs['segment_ids']
+        input_mask = inputs['input_mask']
        # padding id in vocabulary must be set to 0
-        emb_out = fluid.layers.embedding(
+        emb_out = layers.embedding(
            input=src_ids,
            size=[self._voc_size, self._emb_size],
-            dtype=dtype,
+            dtype="float32",
            param_attr=fluid.ParamAttr(
                name=self._word_emb_name, initializer=self._param_initializer),
            is_sparse=False)
        self.emb_out = emb_out
-        position_emb_out = fluid.layers.embedding(
+        position_emb_out = layers.embedding(
            input=pos_ids,
            size=[self._max_position_seq_len, self._emb_size],
-            dtype=dtype,
+            dtype="float32",
            param_attr=fluid.ParamAttr(
                name=self._pos_emb_name, initializer=self._param_initializer))
        self.position_emb_out = position_emb_out
-        sent_emb_out = fluid.layers.embedding(
+        sent_emb_out = layers.embedding(
            sent_ids,
            size=[self._sent_types, self._emb_size],
-            dtype=dtype,
+            dtype="float32"
            param_attr=fluid.ParamAttr(
                name=self._sent_emb_name, initializer=self._param_initializer))
@@ -88,24 +103,21 @@ class Model(object):
        emb_out = emb_out + position_emb_out + sent_emb_out
-        emb_out = transformer.pre_process_layer(
+        emb_out = pre_process_layer(
            emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')
-        if dtype == "float16":
+        self_attn_mask = layers.matmul(
-            input_mask = fluid.layers.cast(x=input_mask, dtype=dtype)
-        self_attn_mask = fluid.layers.matmul(
            x = input_mask, y = input_mask, transpose_y = True)
-        self_attn_mask = fluid.layers.scale(
+        self_attn_mask = layers.scale(
            x = self_attn_mask, scale = 10000.0, bias = -1.0, bias_after_scale = False)
-        n_head_self_attn_mask = fluid.layers.stack(
+        n_head_self_attn_mask = layers.stack(
            x=[self_attn_mask] * self._n_head, axis=1)
        n_head_self_attn_mask.stop_gradient = True
-        self._enc_out = transformer.encoder(
+        enc_out = encoder(
            enc_input = emb_out,
            attn_bias = n_head_self_attn_mask,
            n_layer = self._n_layer,
@@ -123,9 +135,9 @@ class Model(object):
            param_initializer = self._param_initializer,
            name = self.model_name + 'encoder')
-        next_sent_feat = fluid.layers.slice(
+        next_sent_feat = layers.slice(
-            input = self._enc_out, axes = [1], starts = [0], ends = [1])
+            input = enc_out, axes = [1], starts = [0], ends = [1])
-        self.next_sent_feat = fluid.layers.fc(
+        next_sent_feat = layers.fc(
            input = next_sent_feat,
            size = self._emb_size,
            act = "tanh",
@@ -133,19 +145,12 @@ class Model(object):
                name = self.model_name + "pooled_fc.w_0", 
                initializer = self._param_initializer),
            bias_attr = "pooled_fc.b_0")
-    @property
-    def final_word_representation(self):
-        """final layer output of transformer encoder as the (contextual) word representation"""
-        return self._enc_out
-    @property
-    def final_sentence_representation(self):
-        """final representation of the first token ([CLS]) as sentence representation """
-        return self.next_sent_feat
+        return {'word_emb': enc_out,
+                'sentence_emb': next_sent_feat, 
+                'sentence_pair_emb': next_sent_feat}
-if __name__ == "__main__":
+    def postprocess(self, rt_outputs):
-    print("hello world!")
+        pass
--- a/paddlepalm/backbone/bow.py
+++ b/paddlepalm/backbone/bow.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from paddle import fluid
+from paddle.fluid import layers
+from paddlepalm.interface import backbone
+class Model(backbone):
+    """Bag-of-words backbone.
+    Embeds every token id and pools the token embeddings along axis 1
+    (mean and max, concatenated) to form the sentence embedding.
+    """
+    def __init__(self, config, phase):
+        """
+        Args:
+            config: dict providing "emb_size" and "vocab_size".
+            phase: accepted for interface compatibility; not used here.
+        """
+        # self._is_training = phase == 'train' # a backbone usually need not care about the run phase, since its outputs barely change between phases
+        self._emb_size = config["emb_size"]
+        self._voc_size = config["vocab_size"]
+    @property
+    def inputs_attr(self):
+        """Describe the inputs this backbone reads.
+        Returns:
+            dict mapping the input name to a ``[shape, dtype]`` pair.
+        """
+        # The sequence dimension is declared as -1 (variable length): this
+        # class never defines a maximum sequence length attribute.
+        return {"token_ids": [[-1, -1, 1], 'int64']}
+    @property
+    def outputs_attr(self):
+        """Describe the outputs produced by ``build``.
+        Returns:
+            dict mapping each output name to its shape list; the sentence
+            embedding is twice as wide because mean and max pooling are
+            concatenated.
+        """
+        return {"word_emb": [-1, -1, self._emb_size],
+                "sentence_emb": [-1, self._emb_size*2]}
+    def build(self, inputs):
+        """Build the computation graph of the backbone.
+        Args:
+            inputs: dict holding at least the 'token_ids' variable.
+        Returns:
+            dict with 'word_emb' (the embedding lookup result) and
+            'sentence_emb' (mean- and max-pooled embeddings concatenated).
+        """
+        tok_ids = inputs['token_ids']
+        emb_out = layers.embedding(
+            input=tok_ids,
+            size=[self._voc_size, self._emb_size],
+            dtype='float32',
+            param_attr=fluid.ParamAttr(
+                name='word_emb', 
+                initializer=fluid.initializer.TruncatedNormal(scale=0.1)),
+            is_sparse=False)
+        # Pool over axis 1 -- assumes that is the sequence axis, as declared
+        # in inputs_attr; TODO confirm against the reader.
+        sent_emb1 = layers.reduce_mean(emb_out, axis=1)
+        sent_emb2 = layers.reduce_max(emb_out, axis=1)
+        sent_emb = layers.concat([sent_emb1, sent_emb2], axis=1)
+        return {'word_emb': emb_out,
+                'sentence_emb': sent_emb}
+    def postprocess(self, rt_outputs):
+        """No-op hook; the runtime outputs are not post-processed."""
+        pass
--- a/backbone/ernie_model.py
+++ b/backbone/ernie_model.py
+# -*- coding: UTF-8 -*-
 #   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,16 +20,20 @@ from __future__ import print_function
 from __future__ import unicode_literals
 from __future__ import absolute_import
-import paddle.fluid as fluid
+from paddle import fluid
+from paddle.fluid import layers
-import backbone.utils.transformer4ernie as transformer
+from paddlepalm.backbone.utils.transformer import pre_process_layer, encoder
+from paddlepalm.interface import backbone
-class Model(object):
+class Model(backbone):
    def __init__(self,
                 config,
-                 is_training=False,
+                 phase):
-                 ):
+        # self._is_training = phase == 'train' # backbone一般不用关心运行阶段，因为outputs在任何阶段基本不会变
        self._emb_size = config['hidden_size']
        self._n_layer = config['num_hidden_layers']
@@ -40,6 +45,8 @@ class Model(object):
        else:
            self._sent_types = config['type_vocab_size']
+        self._task_types = config['task_type_vocab_size']
        self._hidden_act = config['hidden_act']
        self._prepostprocess_dropout = config['hidden_dropout_prob']
        self._attention_dropout = config['attention_probs_dropout_prob']
@@ -53,12 +60,29 @@ class Model(object):
        self._param_initializer = fluid.initializer.TruncatedNormal(
            scale=config['initializer_range'])
+    @property
+    def inputs_attr(self):
+        """Describe the inputs this backbone reads.
+        Returns:
+            dict mapping each input name to a ``[shape, dtype]`` pair; every
+            input has shape [-1, -1, 1].
+        """
+        dtypes = (("token_ids", 'int64'),
+                  ("position_ids", 'int64'),
+                  ("segment_ids", 'int64'),
+                  ("input_mask", 'float32'),
+                  ("task_ids", 'int64'))
+        # A fresh shape list is created per entry so no two entries alias.
+        return {name: [[-1, -1, 1], dtype] for name, dtype in dtypes}
+    @property
+    def outputs_attr(self):
+        """Describe the outputs produced by ``build``.
+        Returns:
+            dict mapping each output name to a ``[shape, dtype]`` pair; all
+            outputs are declared as 'float32'.
+        """
+        attrs = {}
+        # Outputs declared with shape [-1, -1, emb_size].
+        for key in ("word_embedding", "encoder_outputs"):
+            attrs[key] = [[-1, -1, self._emb_size], 'float32']
+        # Outputs declared with shape [-1, emb_size].
+        for key in ("sentence_embedding", "sentence_pair_embedding"):
+            attrs[key] = [[-1, self._emb_size], 'float32']
+        return attrs
-    def build_model(self, reader_input, use_fp16=False):
+    def build(self, inputs):
-        dtype = "float16" if use_fp16 else "float32"
+        src_ids = inputs['token_ids']
+        pos_ids = inputs['position_ids']
+        sent_ids = inputs['segment_ids']
+        input_mask = inputs['input_mask']
+        task_ids = inputs['task_ids']
-        src_ids, pos_ids, sent_ids, input_mask = reader_input[:4]
        # padding id in vocabulary must be set to 0
        emb_out = fluid.layers.embedding(
            input=src_ids,
@@ -85,12 +109,19 @@ class Model(object):
        emb_out = emb_out + position_emb_out
        emb_out = emb_out + sent_emb_out
-        emb_out = transformer.pre_process_layer(
+        task_emb_out = fluid.layers.embedding(
+            task_ids,
+            size=[self._task_types, self._emb_size],
+            dtype=self._emb_dtype,
+            param_attr=fluid.ParamAttr(
+                name=self._task_emb_name,
+                initializer=self._param_initializer))
+        emb_out = emb_out + task_emb_out
+        emb_out = pre_process_layer(
            emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')
-        if dtype == "float16":
-            emb_out = fluid.layers.cast(x=emb_out, dtype=dtype)
-            input_mask = fluid.layers.cast(x=input_mask, dtype=dtype)
        self_attn_mask = fluid.layers.matmul(
            x=input_mask, y=input_mask, transpose_y=True)
@@ -100,7 +131,7 @@ class Model(object):
            x=[self_attn_mask] * self._n_head, axis=1)
        n_head_self_attn_mask.stop_gradient = True
-        self._enc_out = transformer.encoder(
+        enc_out = encoder(
            enc_input=emb_out,
            attn_bias=n_head_self_attn_mask,
            n_layer=self._n_layer,
@@ -117,20 +148,11 @@ class Model(object):
            postprocess_cmd="dan",
            param_initializer=self._param_initializer,
            name='encoder')
-        if dtype == "float16":        
-            self._enc_out = fluid.layers.cast(
-                x=self._enc_out, dtype=self._emb_dtype)
-    @property
-    def final_word_representation(self):
-        return self._enc_out
-    @property
-    def final_sentence_representation(self):
-        """Get the first feature of each sequence for classification"""
        next_sent_feat = fluid.layers.slice(
-            input=self._enc_out, axes=[1], starts=[0], ends=[1])
+            input=enc_out, axes=[1], starts=[0], ends=[1])
+        next_sent_feat = fluid.layers.reshape(next_sent_feat, [-1, next_sent_feat.shape[-1]])
        next_sent_feat = fluid.layers.fc(
            input=next_sent_feat,
            size=self._emb_size,
@@ -138,5 +160,11 @@ class Model(object):
            param_attr=fluid.ParamAttr(
                name="pooled_fc.w_0", initializer=self._param_initializer),
            bias_attr="pooled_fc.b_0")
-        return next_sent_feat
+        return {'word_embedding': emb_out,
+                'encoder_outputs': enc_out,
+                'sentence_embedding': next_sent_feat,
+                'sentence_pair_embedding': next_sent_feat}
+    def postprocess(self, rt_outputs):
+        """No-op hook; ``rt_outputs`` is ignored and None is returned."""
+        return None
--- a/reader/__init__.py
+++ b/reader/__init__.py
--- a/backbone/utils/transformer4ernie.py
+++ b/backbone/utils/transformer4ernie.py
+# -*- coding: UTF-8 -*-
 #   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -22,7 +23,6 @@ from functools import partial
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 def multi_head_attention(queries,
                         keys,
                         values,

--- a/paradigm/answer_matching.py
+++ b/paradigm/answer_matching.py
+# -*- coding: UTF-8 -*-
 #   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -11,49 +12,31 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# -*- coding: utf-8 -*-
-import paddle.fluid as fluid
+# Dotted names of the paddlepalm sub-packages; presumably used to locate
+# pluggable components by name -- TODO confirm against the importing code.
+BACKBONE_DIR='paddlepalm.backbone'
+TASK_INSTANCE_DIR='paddlepalm.task_instance'
+READER_DIR='paddlepalm.reader'
+PARADIGM_DIR='paddlepalm.task_paradigm'
+OPTIMIZER_DIR='paddlepalm.optimizer'
+# Presumably the name of the entry point looked up inside an optimizer
+# module -- TODO confirm.
+OPTIMIZE_METHOD='optimize'
+# Argument name -> type. Presumably the table the controller hands to
+# _fit_attr(..., strict=True), which casts each value with the mapped type
+# and raises when a key is missing -- verify against the caller.
+REQUIRED_ARGS={
+    'task_instance': str,
+    'backbone': str,
+    'optimizer': str,
+    'learning_rate': float,
+    'batch_size': int
+    }
+# Argument name -> type for arguments that may be absent.
+# NOTE(review): 'reuse_rag' looks like a typo; the controller reads the key
+# 'task_reuse_tag' -- confirm the intended name before relying on it.
+OPTIONAL_ARGS={
+    'mix_ratio': str,
+    'target_tag': str,
+    'reuse_rag': str
+    }
+# Argument name -> type expected in each task instance configuration
+# (presumably; no consumer of this table is visible here -- TODO confirm).
+TASK_REQUIRED_ARGS={
+    'paradigm': str,
+    'reader': str,
+    'train_file': str
+    }
-def compute_loss(output_tensors, args=None):
-    """Compute loss for mrc model"""
-    labels = output_tensors['labels']
-    logits = output_tensors['logits']
-    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
-        logits=logits, label=labels, return_softmax=True)
-    loss = fluid.layers.mean(x=ce_loss)
-    return loss
-def create_model(reader_input, base_model=None, is_training=True, args=None):
-    """
-        given the base model, reader_input
-        return the output tensors
-    """
-    labels = reader_input[-1]
-    cls_feats = base_model.final_sentence_representation
-    cls_feats = fluid.layers.dropout(
-        x=cls_feats,
-        dropout_prob=0.1,
-        dropout_implementation="upscale_in_train")
-    logits = fluid.layers.fc(
-        input=cls_feats,
-        size=2,
-        param_attr=fluid.ParamAttr(
-            name="cls_out_w",
-            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
-        bias_attr=fluid.ParamAttr(
-            name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
-    num_seqs = fluid.layers.fill_constant(shape=[1], value=512, dtype='int64') 
-    output_tensors = {}
-    output_tensors['labels'] = labels
-    output_tensors['logits'] = logits
-    output_tensors['num_seqs'] = num_seqs
-    return output_tensors
--- a/paddlepalm/interface.py
+++ b/paddlepalm/interface.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""v1.1"""
+class reader(object):
+    """Interface of the data manager (dataset reader)."""
+    def __init__(self, config):
+        """
+        Args:
+            config: dict of hyper-parameters; asserted to be a dict.
+        """
+        assert isinstance(config, dict)
+    # @property
+    # def inputs_attr(self):
+    #     """Describe the attributes of the reader's input objects: the name, shape and data type of each object. When an object is a scalar type (e.g. str, int, float), its shape is the empty list []; when a dimension of an object has variable length, the corresponding shape entry is -1.
+    #     Return:
+    #         dict. Attribute description of every input object. For example,
+    #         a text classification task may need the input text and the id of its label
+    #             {"text": ([], 'str'),
+    #              "label": ([], 'int')}
+    #         a tagging task may need the token sequence and the matching tags
+    #             {"tokens", ([-1], 'str'),
+    #              "tags", ([-1], 'str')}
+    #         a machine reading comprehension task may need the context, the question, the answer, the start/end position of the answer span, etc.
+    #             {"paragraph", ([], 'str'),
+    #              "question", ([], 'str'),
+    #              "start_position", ([], 'int')
+    #         """
+    #     raise NotImplementedError()
+    @property
+    def outputs_attr(self):
+        """Describe the attributes of the objects the reader yields: the name, shape and data type of each object. When an object is a scalar type (e.g. str, int, float), its shape is the empty list []; when a dimension of an object has variable length, the corresponding shape entry is -1.
+        Note: when mini-batch gradient descent is used, regular input objects should carry a batch_size dimension (usually -1).
+        Return:
+            dict. Attribute description of every yielded object. For example,
+            for text classification and matching tasks, the yielded content may contain the following objects (downstream backbones and tasks access the ones they need)
+                {"token_ids": ([-1, max_len], 'int64'),
+                 "input_ids": ([-1, max_len], 'int64'),
+                 "segment_ids": ([-1, max_len], 'int64'),
+                 "input_mask": ([-1, max_len], 'float32'),
+                 "label": ([-1], 'int')}
+        """
+        raise NotImplementedError()
+    # def parse_line(self):
+    #     """The framework describes each example with a dict whose keys come from inputs_attr and whose values are the matching inputs, conforming to the attr description.
+    #         This function parses a text line into such a dict-typed example. The default parse_line reads a json dataset file in which every line is one example described in json.
+    #         Users may override this method to adapt to other dataset formats, e.g. csv or even tfrecord files.
+    #         """
+    #     raise NotImplementedError()
+    # 
+    # def tokenize(self, line):
+    #     """The framework ships tokenizers such as the word piece tokenizer; users select one through the tokenizer hyper-parameter. If none of the built-in tokenizers fits, users may override this method to provide a custom tokenizer.
+    #         Args:
+    #             - line: a unicode string. 
+    #         Return:
+    #             a list of tokens
+    #         """
+    #     raise NotImplementedError()
+    def iterator(self):
+        """Dataset traversal interface. Note that when the traversal reaches the end of the dataset, the implementation should reset its pointer automatically, i.e. start a new traversal from the head of the dataset.
+        Yield:
+            (dict) elements that meet the description given by outputs_attr
+        """
+        raise NotImplementedError()
+    @property
+    def num_examples(self):
+        """Number of examples in the dataset, i.e. the number of examples the iterator yields per epoch. Note that when a strategy that may change the number of examples (e.g. a sliding window) is used, this should return the actual number of examples at runtime."""
+        raise NotImplementedError()
+class backbone(object):
+    """Interface of a backbone model."""
+    def __init__(self, config, phase):
+        """
+        Args:
+            config: dict. Hyper-parameters defined in the multi-task config file plus the pre-trained model config file.
+            phase: str. Run phase; train and predict are currently supported.
+            """
+        assert isinstance(config, dict)
+    @property
+    def inputs_attr(self):
+        """Describe the attributes of the input objects the backbone needs from the reader: the name, shape and data type of each object. When an object is a scalar type (e.g. str, int, float), its shape is the empty list []; when a dimension of an object has variable length, the corresponding shape entry is -1.
+        Return:
+            dict. Attribute description of every input object. For example,
+            for text classification and matching tasks, the reader objects a bert backbone depends on are mainly
+                {"token_ids": ([-1, max_len], 'int64'),
+                 "position_ids": ([-1, max_len], 'int64'),
+                 "segment_ids": ([-1, max_len], 'int64'),
+                 "input_mask": ([-1, max_len], 'float32')}"""
+        raise NotImplementedError()
+    @property
+    def outputs_attr(self):
+        """Describe the attributes of the backbone's output objects: the name, shape and data type of each object. When an object is a scalar type (e.g. str, int, float), its shape is the empty list []; when a dimension of an object has variable length, the corresponding shape entry is -1.
+        Return:
+            dict. Attribute description of every output object. For example,
+            for text classification and matching tasks, the outputs of a bert backbone may contain
+                {"word_emb": ([-1, max_seqlen, word_emb_size], 'float32'),
+                 "sentence_emb": ([-1, hidden_size], 'float32'),
+                 "sim_vec": ([-1, hidden_size], 'float32')}""" 
+        raise NotImplementedError()
+    def build(self, inputs):
+        """Build the computation graph of the backbone: map static-graph Variables that match inputs_attr to static-graph Variables that match outputs_attr.
+        Args:
+            inputs: dict mapping the object names of inputs_attr to computation-graph Variables; it contains at least the objects defined in inputs_attr.
+        Return:
+           The graph variables to output. They are added to the fetch_list, so their runtime values are obtained at every training/inference step and passed to the postprocess method for user-side handling.
+            """
+        raise NotImplementedError()
+class task_paradigm(object):
+    """Interface of a task paradigm (the task-specific layer)."""
+    def __init__(self, config, phase, backbone_config):
+        """
+            config: dict. Hyper-parameters defined in the task instance file plus the multi-task config file.
+            phase: str. Run phase; train and predict are currently supported.
+            """
+    @property
+    def inputs_attrs(self):
+        """Describe the attributes of the input objects the task layer reads from input object sets such as the reader and the backbone. The first-level key is the name of an object set (e.g. backbone, reader; more flexible inputs will be supported later); the second level describes each object of that set: its name, shape and dtype. When an object is a scalar type (e.g. str, int, float), its shape is the empty list []; when a dimension of an object has variable length, the corresponding shape entry is -1.
+        Return:
+            dict. Attribute description of every object set and its input objects."""
+        raise NotImplementedError()
+    @property
+    def outputs_attr(self):
+        """Describe the attributes of the task's output objects: name, shape and dtype. Output objects are added to the fetch_list, so their runtime values are obtained at every training/inference step and passed to the postprocess method for user-side handling.
+        When an object is a scalar type (e.g. str, int, float), its shape is the empty list []; when a dimension of an object has variable length, the corresponding shape entry is -1.
+        Return:
+            dict. Attribute description of every output object. Note that in the training phase an output object named loss is mandatory.
+            """
+        raise NotImplementedError()
+    def build(self, inputs):
+        """Build the computation graph of the task layer: map the static-graph Variables of every object set described by inputs_attrs to static-graph Variables that match outputs_attr.
+        Args:
+            inputs: dict mapping the object names of inputs_attrs to computation-graph Variables; it contains at least the objects defined in inputs_attrs.
+        Return:
+           The graph variables to output. They are added to the fetch_list, so their runtime values are obtained at every training/inference step and passed to the postprocess method for user-side handling.
+        """
+        raise NotImplementedError()
+    def postprocess(self, rt_outputs):
+        """Post-process the runtime results of the task layer for the current batch after every training or inference step. Note that, besides the outputs of the build method, rt_outputs automatically contains the computed loss."""
+        pass
+    def post_postprocess(self, global_buffer):
+        # No-op by default. NOTE(review): presumably invoked once with the
+        # results accumulated across batches -- confirm against the caller.
+        pass
--- a/paddlepalm/mtl_controller.py
+++ b/paddlepalm/mtl_controller.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+import importlib
+import multiprocessing
+from paddle import fluid
+from paddle.fluid import layers
+import yaml
+import json
+import logging
+import time
+import numpy as np
+from paddlepalm.utils.saver import init_pretraining_params, init_checkpoint
+from paddlepalm.utils.config_helper import PDConfig
+from paddlepalm.utils.print_helper import print_dict
+from paddlepalm.utils.reader_helper import create_net_inputs, create_iterator_fn, create_joint_iterator_fn, merge_input_attrs 
+from paddlepalm.default_settings import *
+from paddlepalm.task_instance import TaskInstance, check_instances
+DEBUG=False
+VERBOSE=0
+def _get_basename(f):
+    """Return path ``f`` without its final extension (directories are kept)."""
+    stem, _ext = os.path.splitext(f)
+    return stem
+def _get_suffix(f):
+    """Return the final extension of path ``f`` including the dot, or ''."""
+    _stem, ext = os.path.splitext(f)
+    return ext
+def _parse_yaml(f, asdict=True, support_cmd_line=False):
+    """Load the YAML configuration file ``f``.
+    Args:
+        f: path of the YAML file; asserted to exist.
+        asdict: return a plain dict when True; otherwise the PDConfig
+            object (only available with support_cmd_line=True).
+        support_cmd_line: load through PDConfig with fuse_args=True
+            (presumably merging command-line overrides -- TODO confirm).
+    Raises:
+        NotImplementedError: asdict=False without support_cmd_line.
+    """
+    assert os.path.exists(f), "file {} not found.".format(f)
+    if not support_cmd_line:
+        if not asdict:
+            raise NotImplementedError()
+        with open(f, "r") as fin: 
+            yaml_config = yaml.load(fin, Loader=yaml.SafeLoader)
+        return yaml_config
+    args = PDConfig(yaml_file=f, fuse_args=True)
+    args.build()
+    if asdict:
+        return args.asdict()
+    return args
+def _parse_json(f, asdict=True, support_cmd_line=False):
+    """Load the JSON configuration file ``f``.
+    Args:
+        f: path of the JSON file; asserted to exist.
+        asdict: return a plain dict when True; otherwise the PDConfig
+            object (only available with support_cmd_line=True).
+        support_cmd_line: load through PDConfig with fuse_args enabled
+            (presumably merging command-line overrides -- TODO confirm).
+    Raises:
+        NotImplementedError: asdict=False without support_cmd_line.
+    """
+    assert os.path.exists(f), "file {} not found.".format(f)
+    if not support_cmd_line:
+        if not asdict:
+            raise NotImplementedError()
+        with open(f, "r") as fin: 
+            config = json.load(fin)
+        return config
+    args = PDConfig(json_file=f, fuse_args=support_cmd_line)
+    args.build()
+    if asdict:
+        return args.asdict()
+    return args
+def _parse_list(string, astype=str):
+    """Split a comma separated string into a list of ``astype`` values.
+    A string without any comma is converted as a whole (whitespace is not
+    used as a separator in that case); otherwise commas are treated like
+    whitespace and empty pieces are dropped.
+    """
+    assert isinstance(string, str), "{} is not a string.".format(string)
+    if ',' in string:
+        pieces = string.replace(',', ' ').split()
+        return list(map(astype, pieces))
+    return [astype(string)]
+def _try_float(s):
+    """Return ``float(s)`` when ``s`` converts to a float, else ``s`` itself.
+    Only conversion failures are absorbed; anything else (for example
+    KeyboardInterrupt) propagates to the caller.
+    """
+    try:
+        return float(s)
+    except (TypeError, ValueError, OverflowError):
+        return s
+def _check_conf(conf, checklist=None):
+    """Normalize a config dict and optionally validate required arguments.
+    Args:
+        conf: dict of configuration values. String values that look like
+            numbers are converted to float; everything else is kept.
+        checklist: optional ``{name: type}`` dict, or an iterable of
+            ``(name, type)`` pairs, naming arguments that must be present
+            with the given type.
+    Returns:
+        A new dict; ``conf`` itself is not modified.
+    Raises:
+        AssertionError: ``conf`` is not a dict, or a checklist entry is
+            missing or has the wrong type.
+    """
+    assert isinstance(conf, dict), "{} is not a dict.".format(conf)
+    ret = {}
+    for k,v in conf.items():
+        if isinstance(v, str):
+            v = _try_float(v)
+        ret[k] = v
+    if checklist is not None:
+        # Iterating a dict yields keys only, so take its items explicitly.
+        pairs = checklist.items() if isinstance(checklist, dict) else checklist
+        for k, t in pairs:
+            assert k in ret, "required argument {} is NOT exist in config file.".format(k)
+            assert isinstance(ret[k], t), "value type of argument {} should be {}".format(k, t)
+    return ret
+# TODO: add a None mechanism so that hidden size, batch size and seqlen may be set to None
+def _check_io(in_attr, out_attr, strict=False, in_name="left", out_name="right"):
+    """Check that every attribute requested in ``in_attr`` is offered by ``out_attr``.
+    Args:
+        in_attr: dict name -> attr requested by the consumer.
+        out_attr: dict name -> attr offered by the producer.
+        strict: raise on a shape/dtype mismatch instead of logging a warning.
+        in_name, out_name: labels used in the messages.
+    Raises:
+        AssertionError: a requested name is absent from ``out_attr``.
+        ValueError: attrs differ and ``strict`` is True.
+    """
+    for name, attr in in_attr.items():
+        assert name in out_attr, in_name+': '+name+' not found in '+out_name
+        provided = out_attr[name]
+        if attr != provided:
+            if not strict:
+                logging.warning('{}: shape or dtype not consistent!\n{}:\n{}\n{}:\n{}'.format(name, in_name, attr, out_name, provided))
+            else:
+                raise ValueError(name+': shape or dtype not consistent!')
+def _merge_conf(conf1, conf2, conf1_first=True, strict=False):
+    """Merge two config dicts into a new dict.
+    Args:
+        conf1, conf2: dicts to merge; neither is modified.
+        conf1_first: when True the values of conf1 win on shared keys,
+            otherwise the values of conf2 win.
+        strict: when True, keys that exist only in the winning dict are
+            dropped, so the result has exactly the keys of the other dict.
+    Returns:
+        The merged dict (a shallow copy).
+    Raises:
+        AssertionError: conf1 or conf2 is not a dict.
+    """
+    assert isinstance(conf1, dict), "{} is not a dict.".format(conf1)
+    assert isinstance(conf2, dict), "{} is not a dict.".format(conf2)
+    base_conf = conf2 if conf1_first else conf1
+    base_conf = base_conf.copy()
+    new_conf = conf1 if conf1_first else conf2
+    for k, v in new_conf.items():
+        if k in base_conf:
+            if base_conf[k] != v:
+                # Report the override instead of raising: raising Warning here
+                # aborted the merge and made the assignment below unreachable
+                # for every value that actually differs.
+                logging.warning("value of argument {} has been updated to {}.".format(k, v))
+        elif strict:
+            continue
+        base_conf[k] = v
+    return base_conf
+def _encode_inputs(inputs, scope_name, sep='/', cand_set=None):
+    """Prefix the keys of ``inputs`` with ``scope_name + sep``.
+    Args:
+        inputs: dict name -> value.
+        scope_name: scope used as key prefix.
+        sep: separator placed between scope and name.
+        cand_set: optional container of accepted keys. When given, a value
+            is emitted under its bare name and/or its scoped name, whichever
+            of the two appears in ``cand_set``; other entries are dropped.
+    Returns:
+        A new dict with the selected keys.
+    """
+    if cand_set is None:
+        return {scope_name+sep+k: v for k, v in inputs.items()}
+    outputs = {}
+    for k, v in inputs.items():
+        if k in cand_set:
+            outputs[k] = v
+        scoped = scope_name+sep+k
+        if scoped in cand_set:
+            outputs[scoped] = v
+    return outputs
+def _decode_inputs(inputs, scope_name, sep='/', keep_unk_keys=True):
+    """Select the entries of ``inputs`` that are visible inside ``scope_name``.
+    Args:
+        inputs: dict name -> value; names may be scoped as ``scope + sep + name``.
+        scope_name: scope whose entries are extracted (prefix stripped).
+        sep: separator between scope and name.
+        keep_unk_keys: also keep entries whose name contains no separator.
+    Returns:
+        A new dict with the un-prefixed names of this scope, plus the
+        unscoped entries when ``keep_unk_keys`` is True.
+    """
+    # Build the prefix from ``sep`` so that a custom separator is honoured
+    # both when detecting unscoped names and when matching this scope.
+    prefix = scope_name + sep
+    outputs = {}
+    for name, value in inputs.items():
+        # var for backbone are also available to tasks
+        if keep_unk_keys and sep not in name:
+            outputs[name] = value
+        # var for this inst
+        if name.startswith(prefix):
+            outputs[name[len(prefix):]] = value
+    return outputs
+def _init_env(use_gpu):
+    """Create the executor and count the available devices.
+    Args:
+        use_gpu: run on CUDA device 0 when truthy, otherwise on CPU.
+    Returns:
+        Tuple ``(fluid.Executor, dev_count)``. On CPU the count comes from
+        the CPU_NUM environment variable, falling back to the number of
+        CPU cores.
+    """
+    if not use_gpu:
+        place = fluid.CPUPlace()
+        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+    else:
+        place = fluid.CUDAPlace(0)
+        dev_count = fluid.core.get_cuda_device_count()
+    executor = fluid.Executor(place)
+    return executor, dev_count
+def _fit_attr(conf, fit_attr, strict=False):
+    """Cast the values of ``conf`` in place with the callables of ``fit_attr``.
+    Args:
+        conf: dict to update (mutated and returned).
+        fit_attr: dict name -> callable applied to ``conf[name]``.
+        strict: raise when a name of ``fit_attr`` is missing from ``conf``;
+            otherwise missing names are skipped.
+    Raises:
+        Exception: a name is missing and ``strict`` is True.
+    """
+    for key, caster in fit_attr.items():
+        if key in conf:
+            conf[key] = caster(conf[key])
+        elif strict:
+            raise Exception('Argument {} is required to create a controller.'.format(key))
+    return conf
+class Controller(object):
+    def __init__(self, config=None, task_dir='.', for_train=True):
+        """
+        Args:
+            config: (str|dict) 字符串类型时，给出yaml格式的config配置文件路径；
+        """
+        self._for_train = for_train
+        # default mtl_conf
+        # if config is None and config_path is None:
+        #     raise ValueError('For config and config_path, at least one of them should be set.')
+        if isinstance(config, str):
+            mtl_conf = _parse_yaml(config, support_cmd_line=True)
+            # if config is not None:
+            #     mtl_conf = _merge_conf(config, mtl_conf)
+        else:
+            mtl_conf = config
+        mtl_conf = _check_conf(mtl_conf)
+        mtl_conf = _fit_attr(mtl_conf, REQUIRED_ARGS, strict=True)
+        mtl_conf = _fit_attr(mtl_conf, OPTIONAL_ARGS, strict=False)
+        exe, dev_count = _init_env(use_gpu=mtl_conf.get('use_gpu', True))
+        self.exe = exe
+        self.dev_count = dev_count
+        print_dict(mtl_conf, title='main configuration')
+        # parse task instances and target tags
+        instnames = _parse_list(mtl_conf['task_instance'])
+        assert len(instnames) == len(set(instnames)), "repeated task_instance is NOT supported."
+        num_instances = len(instnames)
+        self.num_instances = num_instances
+        instname_to_conf = {}
+        instname_to_id = {}
+        for id, instname in enumerate(instnames):
+            instpath = os.path.join(task_dir, instname+'.yaml')
+            conf = _parse_yaml(instpath, support_cmd_line=False)
+            # conf = _check_conf(conf, TASK_INSTANCE_REQUIRED_ARGS)
+            conf = _check_conf(conf)
+            temp_conf = _merge_conf(mtl_conf, conf, strict=True)
+            print_dict(temp_conf, title='{} configuration'.format(instname))
+            conf = _merge_conf(mtl_conf, conf)
+            instname_to_conf[instname] = conf
+            instname_to_id[instname] = id
+        # create task instances
+        instances = []
+        for name in instnames:
+            instances.append(TaskInstance(name, instname_to_id[name], instname_to_conf[name]))
+        check_instances(instances)
+        # parse target_tag
+        if 'target_tag' in mtl_conf:
+            target_tag = str(mtl_conf['target_tag'])
+            tags = _parse_list(target_tag, astype=int)
+            assert len(tags) == len(instnames), "number of target_tag is NOT consistent with that in task_instance."
+            for tag, inst in zip(tags, instances):
+                inst.is_target = tag
+        else:
+            tags = [i.is_target for i in instances]
+        num_targets = sum(tags)
+        num_auxes = num_instances - num_targets
+        # parse mix ratios
+        if 'mix_ratio' in mtl_conf:
+            mix_ratio = str(mtl_conf['mix_ratio'])
+            mrs = _parse_list(mix_ratio, astype=float)
+            assert len(mrs) == num_instances, "number of mix_ratios is NOT consistent with num_instances."
+        else:
+            # TODO: 增加joint training模式，让num_epochs平等的作用于每个instance
+            mrs = [1.0] * num_instances
+        for mr, inst in zip(mrs, instances):
+            inst.mix_ratio = mr
+        # parse task layer reuse tags
+        instname_to_reusehost = {i:i for i in instnames}
+        if 'task_reuse_tag' in mtl_conf:
+            tags = _parse_list(mtl_conf['task_reuse_tag'], astype=int)
+            assert len(tags) == num_targets, 'number of reuse_tags is NOT consistent with number of instances.'
+        else:
+            tags = []
+            mapper = {}
+            for inst in instances:
+                # 有环则tag_id + 1，否则被mapper shutdown
+                history = set()
+                history.add(inst.name)
+                cur_inst = inst
+                while True:
+                    # 发现有环
+                    if cur_inst.task_reuse_scope in history:
+                        mapper[inst.name] = len(tags)
+                        break
+                    # 发现在mapper中
+                    elif cur_inst.task_reuse_scope in mapper:
+                        mapper[inst.name] = mapper[cur_inst.task_reuse_scope]
+                        break
+                    else:
+                        cur_inst = name_to_instance[cur_inst.task_reuse_scope]
+                        history.add(cur_inst.name)
+                tags.append(mapper[inst.name])
+            # 注意，上面这段需要做单元测试
+        for i in range(1, num_instances):
+            for j in range(i):
+                if tags[i] == tags[j]:
+                    # check paradigm of reused tasks
+                    assert instances[i].task_paradigm == \
+                            instances[j].task_paradigm, \
+                            "paradigm of reuse tasks should be consistent"
+                    instances[i].task_reuse_scope = instances[j].name
+                    break
+        # parse Reader and Paradigm for each instance
+        for inst in instances:
+            reader_name = inst.config['reader']
+            reader_mod = importlib.import_module(READER_DIR + '.' + reader_name)
+            Reader = getattr(reader_mod, 'Reader')
+            parad_name = inst.config['paradigm']
+            parad_mod = importlib.import_module(PARADIGM_DIR + '.' + parad_name)
+            Paradigm = getattr(parad_mod, 'TaskParadigm')
+            inst.Reader = Reader
+            inst.Paradigm = Paradigm
+        # prepare backbone
+        if 'backbone_config_path' in mtl_conf:
+            bb_conf = _parse_json(mtl_conf['backbone_config_path'])
+            bb_conf = _merge_conf(mtl_conf, bb_conf)
+        else:
+            bb_conf = mtl_conf
+        print_dict(bb_conf, title='backbone configuration'.format(instname))
+        bb_name = mtl_conf['backbone']
+        bb_mod = importlib.import_module(BACKBONE_DIR + '.' + bb_name)
+        Backbone = getattr(bb_mod, 'Model')
+        self.instances = instances
+        self.mrs = mrs
+        self.Backbone = Backbone
+        self.bb_conf = bb_conf
+        self.bb_name = bb_name
+        self.has_init_train = False
+        self.has_init_pred = False
+        if self._for_train:
+            print("initialing for training...")
+            self._init_train()
+            self.has_init_train = True
+    def _init_train(self):
+        """Build the joint multi-task training graph and initialize parameters.
+        Steps: pick the first target instance as the main task, create the
+        train/pred backbones, readers and task layers, check the I/O contracts
+        between them, merge all reader outputs into one joint iterator, build
+        the train and pred programs, combine the task losses and (when
+        'optimizer' is configured) attach the optimizer, then run the startup
+        program.
+        Raises:
+            ValueError: if no task instance is marked as a target.
+        """
+        instances = self.instances
+        Backbone = self.Backbone
+        bb_conf = self.bb_conf
+        bb_name = self.bb_name
+        dev_count = self.dev_count
+        num_instances = len(instances)
+        # set first_target/main task instance
+        main_inst = None
+        for inst in instances:
+            if inst.is_target:
+                main_inst = inst
+                inst.is_first_target = True
+                break
+        if main_inst is None:
+            # fail early with a clear message instead of an AttributeError on None below
+            raise ValueError('no target task instance found, at least one is required for training.')
+        main_conf = main_inst.config
+        if not os.path.exists(main_conf['save_path']):
+            os.makedirs(main_conf['save_path'])
+        # prepare backbone
+        train_backbone = Backbone(bb_conf, phase='train')
+        pred_backbone = Backbone(bb_conf, phase='pred')
+        # create reader, task
+        # then check i/o across reader, backbone and task_layer
+        task_attrs = []
+        pred_task_attrs = []
+        for inst in instances:
+            train_reader = inst.Reader(inst.config, phase='train')
+            inst.reader['train'] = train_reader
+            train_parad = inst.Paradigm(inst.config, phase='train', backbone_config=bb_conf)
+            inst.task_layer['train'] = train_parad
+            task_attr_from_reader = _encode_inputs(train_parad.inputs_attrs['reader'], inst.name)
+            task_attrs.append(task_attr_from_reader)
+            _check_io(train_backbone.inputs_attr, train_reader.outputs_attr, in_name=bb_name+'_backbone', out_name='reader.train')
+            _check_io(train_parad.inputs_attrs['reader'], train_reader.outputs_attr, in_name='task_paradigm.train.reader', out_name='reader.train')
+            _check_io(train_parad.inputs_attrs['backbone'], train_backbone.outputs_attr, in_name='task_paradigm.train.backbone', out_name=bb_name+'_backbone')
+            if inst.is_target:
+                if 'pred_file' not in inst.config:
+                    inst.config['pred_file'] = ''
+                # this pred reader only exists to expose its outputs_attr for the
+                # checks below, so it is deliberately not stored on the instance
+                pred_reader = inst.Reader(inst.config, phase='pred')
+                pred_parad = inst.Paradigm(inst.config, phase='pred', backbone_config=bb_conf)
+                inst.task_layer['pred'] = pred_parad
+                # workaround for a framework pitfall: encode the pred inputs per instance
+                task_attr_from_reader = _encode_inputs(pred_parad.inputs_attrs['reader'], inst.name)
+                pred_task_attrs.append(task_attr_from_reader)
+                _check_io(pred_backbone.inputs_attr, pred_reader.outputs_attr, in_name=bb_name+'_backbone', out_name='reader.pred')
+                _check_io(pred_parad.inputs_attrs['reader'], pred_reader.outputs_attr, in_name='task_paradigm.pred.reader', out_name='reader.pred')
+                _check_io(pred_parad.inputs_attrs['backbone'], pred_backbone.outputs_attr, in_name='task_paradigm.pred.backbone', out_name=bb_name+'_backbone')
+        # merge reader input attrs from backbone and task_instances
+        joint_input_names, joint_shape_and_dtypes, name_to_position = merge_input_attrs(train_backbone.inputs_attr, task_attrs)
+        pred_joint_input_names, pred_joint_shape_and_dtypes, _ = merge_input_attrs(pred_backbone.inputs_attr, pred_task_attrs, insert_taskid=False)
+        # shapes: [task_id, shapes_of_backbone, shapes_of_inst1, ..., shapes_of_instN]
+        if DEBUG:
+            print('----- for debug -----')
+            print('joint input names:')
+            print(joint_input_names)
+            print('joint input shape and dtypes:')
+            print(joint_shape_and_dtypes)
+        # load data
+        for inst in instances:
+            print(inst.name+": preparing data...")
+            inst.reader['train'].load_data()
+        # merge dataset iterators and create net input vars
+        iterators = []
+        prefixes = []
+        mrs = []
+        for inst in instances:
+            iterators.append(inst.reader['train'].iterator())
+            prefixes.append(inst.name)
+            mrs.append(inst.mix_ratio)
+        joint_iterator_fn = create_joint_iterator_fn(iterators, prefixes, joint_shape_and_dtypes, mrs, name_to_position, dev_count=dev_count, verbose=VERBOSE)
+        input_attrs = [[i, j, k] for i, (j,k) in zip(joint_input_names, joint_shape_and_dtypes)]
+        pred_input_attrs = [[i, j, k] for i, (j,k) in zip(pred_joint_input_names, pred_joint_shape_and_dtypes)]
+        # `async` is a reserved keyword since Python 3.7; passing it through a dict
+        # keeps the same call while letting this module parse on every version
+        net_inputs = create_net_inputs(input_attrs, iterator_fn=joint_iterator_fn, dev_count=dev_count, n_prefetch=3, **{'async': True})
+        # build backbone and task layers
+        # the build fails when no unique-name scope is given (framework pitfall)
+        with fluid.unique_name.guard("backbone-"):
+            bb_output_vars = train_backbone.build(net_inputs)
+        assert sorted(bb_output_vars.keys()) == sorted(train_backbone.outputs_attr.keys())
+        # prediction needs its own program pair; building it inside the train program fails
+        pred_prog = fluid.Program()
+        pred_init_prog = fluid.Program()
+        train_prog = fluid.default_main_program()
+        train_init_prog = fluid.default_startup_program()
+        with fluid.program_guard(main_program = pred_prog, startup_program = pred_init_prog):
+            pred_net_inputs = create_net_inputs(pred_input_attrs)
+            with fluid.unique_name.guard("backbone-"):
+                pred_bb_output_vars = pred_backbone.build(pred_net_inputs)
+        # make sure the train programs are the defaults again before building task layers
+        fluid.framework.switch_main_program(train_prog)
+        fluid.framework.switch_startup_program(train_init_prog)
+        task_output_vars = {}
+        for inst in instances:
+            task_inputs = {'backbone': bb_output_vars}
+            task_inputs_from_reader = _decode_inputs(net_inputs, inst.name)
+            task_inputs['reader'] = task_inputs_from_reader
+            scope = inst.task_reuse_scope + '/'
+            with fluid.unique_name.guard(scope):
+                output_vars = inst.build_task_layer(task_inputs, phase='train')
+                output_vars = {inst.name+'/'+key: val for key, val in output_vars.items()}
+                old = len(task_output_vars) # for debug
+                task_output_vars.update(output_vars)
+                assert len(task_output_vars) - old == len(output_vars) # for debug
+            # prepare predict vars for saving inference model
+            if inst.is_target:
+                with fluid.program_guard(pred_prog, pred_init_prog):
+                    # the pred-phase backbone graph lives in pred_prog as well; it is
+                    # unclear whether this costs extra GPU memory
+                    cur_inputs = _decode_inputs(pred_net_inputs, inst.name)
+                    inst.pred_input = cur_inputs
+                    pred_task_inputs = {'backbone': pred_bb_output_vars, 'reader': cur_inputs}
+                    scope = inst.task_reuse_scope + '/'
+                    with fluid.unique_name.guard(scope):
+                        inst.build_task_layer(pred_task_inputs, phase='pred')
+        bb_fetches = {k: v.name for k,v in bb_output_vars.items()}
+        task_fetches = {k: v.name for k,v in task_output_vars.items()}
+        old = len(bb_fetches)+len(task_fetches) # for debug
+        fetches = bb_fetches.copy()
+        fetches.update(task_fetches)
+        assert len(fetches) == old # for debug
+        fetches['__task_id'] = net_inputs['__task_id'].name
+        # compute loss: the one-hot task id selects the loss of the task that produced the batch
+        task_id_var = net_inputs['__task_id']
+        task_id_vec = layers.one_hot(task_id_var, num_instances)
+        losses = fluid.layers.concat([task_output_vars[inst.name+'/loss'] for inst in instances], axis=0)
+        loss = layers.reduce_sum(task_id_vec * losses)
+        main_reader = main_inst.reader['train']
+        num_examples = main_reader.num_examples
+        for inst in instances:
+            inst_train_steps = int(main_conf['num_epochs']* inst.mix_ratio * num_examples) // main_conf['batch_size']  // dev_count
+            print('{}: expected train steps {}.'.format(inst.name, inst_train_steps))
+            inst.steps_pur_epoch = inst.reader['train'].num_examples // main_conf['batch_size']  // dev_count
+            inst.expected_train_steps = inst_train_steps
+        global_max_train_steps = int(main_conf['num_epochs'] * num_examples * sum(mrs)) // main_conf['batch_size']  // dev_count
+        print('Estimated overall train steps {}.'.format(global_max_train_steps))
+        if 'warmup_proportion' in main_conf and main_conf['warmup_proportion'] > 0:
+            warmup_steps = int(global_max_train_steps * main_conf['warmup_proportion'])
+            print('Warmup steps: '+str(warmup_steps))
+        else:
+            warmup_steps = 0
+        # build optimizer
+        # each task could in principle use its own optimizer; a single shared one is used here
+        if 'optimizer' in main_conf:
+            optim_mod = importlib.import_module(OPTIMIZER_DIR + '.' + main_conf['optimizer'])
+            optimize = getattr(optim_mod, OPTIMIZE_METHOD)
+            # the schedule must span the whole joint run: warmup_steps is derived from
+            # global_max_train_steps, so the same total is handed to the optimizer
+            # (previously the step count of the last instance in the loop leaked in here)
+            optimize(loss, main_conf, global_max_train_steps, warmup_steps, fluid.default_main_program())
+            loss.persistable = True
+            if main_conf.get('use_ema', False):
+                assert 'ema_decay' in main_conf, "ema_decay should be set when use_ema is enabled."
+                ema = fluid.optimizer.ExponentialMovingAverage(main_conf['ema_decay'])
+                ema.update()
+        # prepare for train
+        self.train_backbone = train_backbone
+        self.train_program = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name)
+        self.saver_program = fluid.default_main_program()
+        self.main_inst = main_inst
+        self.fetches = fetches
+        self.has_init_train = True
+        # NOTE(review): has_init_pred is also flipped here although only the pred
+        # graph (not a loaded inference model) exists at this point - confirm intent
+        self.has_init_pred = True
+        self.exe.run(fluid.default_startup_program())
+        print("\nRandomly initialize parameters...\n")
+    def _init_pred(self, instance, infer_model_path):
+        """Prepare `instance` for prediction and return its loaded program.
+        Creates the pred-phase backbone and task paradigm, registers the
+        paradigm on the instance, loads the program via `instance.load` and
+        makes sure a pred-phase reader exists.
+        Args:
+            instance: the task instance to predict with.
+            infer_model_path: path handed to `instance.load`.
+        Returns:
+            whatever `instance.load(infer_model_path)` returns.
+        """
+        backbone_for_pred = self.Backbone(self.bb_conf, phase='pred')
+        instance.task_layer['pred'] = instance.Paradigm(
+            instance.config, phase='pred', backbone_config=self.bb_conf)
+        # The merged attributes are not consumed below; the call is kept so the
+        # same merge runs as before.
+        merge_input_attrs(
+            backbone_for_pred.inputs_attr,
+            instance.task_layer['pred'].inputs_attrs['reader'],
+            insert_taskid=False)
+        loaded_prog = instance.load(infer_model_path)
+        # create the pred reader lazily, only when none has been attached yet
+        if instance.reader['pred'] is None:
+            instance.reader['pred'] = instance.Reader(instance.config, phase='pred')
+        return loaded_prog
+    def load_pretrain(self, pretrain_model_path=None):
+        """Load pretrained parameters (or a checkpoint) via init_pretraining_params.
+        Args:
+            pretrain_model_path: location of the pretrained model; when None the
+                'pretrain_model_path' entry of self.main_conf is used instead.
+        """
+        model_path = pretrain_model_path
+        if model_path is None:
+            # fall back to the path given in the main configuration
+            assert 'pretrain_model_path' in self.main_conf, "pretrain_model_path NOT set."
+            model_path = self.main_conf['pretrain_model_path']
+        init_pretraining_params(self.exe, model_path, main_program=fluid.default_startup_program())
+    def train(self):
+        """Run the joint training loop until every target task reports completion.
+        Each step runs the compiled train program once, reads back which task
+        produced the batch from the '__task_id' fetch, lets the backbone and
+        that task's train layer postprocess their outputs, and advances that
+        task's step counter. Progress is printed every 'print_every_n_steps'
+        steps (default 5), persistables are saved every 'save_every_n_steps'
+        steps when configured, and once more at the end with a '_final' suffix.
+        """
+        # TODO: back up the configuration files so training can be resumed and later prediction is supported
+        if not self.has_init_train:
+            self._init_train()
+            self.has_init_train = True
+        instances = self.instances
+        num_instances = self.num_instances
+        main_inst = self.main_inst
+        main_conf = main_inst.config
+        backbone = self.train_backbone
+        train_program = self.train_program
+        saver_program = self.saver_program
+        fetches = self.fetches
+        print_every = main_conf.get('print_every_n_steps', 5)
+        def train_finish():
+            # training stops once all target instances are flagged as finished
+            return all(inst.train_finish for inst in instances if inst.is_target)
+        fetch_names, fetch_list = zip(*fetches.items())
+        global_step = 0 # count for all tasks
+        time_begin = time.time()
+        backbone_buffer = []
+        # one independent list per task; `[[]] * n` would alias a single list n times
+        # NOTE(review): both buffers are only appended to in this method and never read
+        task_buffer = [[] for _ in range(num_instances)]
+        while not train_finish():
+            rt_outputs = self.exe.run(train_program, fetch_list=fetch_list)
+            rt_outputs = {k:v for k,v in zip(fetch_names, rt_outputs)}
+            # every example of a batch must come from the same task
+            rt_task_id = np.squeeze(rt_outputs['__task_id']).tolist()
+            assert (not isinstance(rt_task_id, list)) or len(set(rt_task_id)) == 1
+            rt_task_id = rt_task_id[0] if isinstance(rt_task_id, list) else rt_task_id
+            cur_task = instances[rt_task_id]
+            # backbone outputs are the fetches without a '<task name>/' prefix
+            backbone_rt_outputs = {k:v for k,v in rt_outputs.items() if '/' not in k}
+            backbone_buffer.append(backbone.postprocess(backbone_rt_outputs))
+            task_prefix = cur_task.name+'/'
+            task_rt_outputs = {k[len(task_prefix):]: v for k,v in rt_outputs.items() if k.startswith(task_prefix)}
+            temp = cur_task.task_layer['train'].postprocess(task_rt_outputs)
+            task_buffer[rt_task_id].append(temp)
+            global_step += 1
+            cur_task.cur_train_step += 1
+            if global_step % print_every == 0:
+                loss = rt_outputs[cur_task.name+'/loss']
+                loss = np.mean(np.squeeze(loss)).tolist()
+                time_end = time.time()
+                time_cost = time_end - time_begin
+                print("Global step: {}. Task: {}, step {}/{} (epoch {}), loss: {:.3f}, speed: {:.2f} steps/s".format(
+                       global_step, cur_task.name, cur_task.cur_train_step, cur_task.steps_pur_epoch, cur_task.cur_train_epoch,
+                       loss, print_every / time_cost))
+                time_begin = time.time()
+            if 'save_every_n_steps' in main_conf and global_step % main_conf['save_every_n_steps'] == 0:
+                save_path = os.path.join(main_conf['save_path'],
+                                         "step_" + str(global_step))
+                fluid.io.save_persistables(self.exe, save_path, saver_program)
+        save_path = os.path.join(main_conf['save_path'],
+                                 "step_" + str(global_step) + "_final")
+        fluid.io.save_persistables(self.exe, save_path, saver_program)
+    def pred(self, task_instance, inference_model_dir=None):
+        """Run prediction for the task instance named `task_instance`.
+        Args:
+            task_instance: name (str) of one of this controller's instances.
+            inference_model_dir: directory passed on to `_init_pred`; it is
+                mandatory while `has_init_pred` is False.
+        Raises:
+            Exception: if this controller was built for training.
+            ValueError: if no model directory is available, or the name does
+                not match any instance.
+        """
+        if self._for_train:
+            raise Exception('This controller is a trainer. Please build a new controller with for_train=False for predicting.')
+        assert isinstance(task_instance, str)
+        if isinstance(inference_model_dir, str):
+            assert os.path.exists(inference_model_dir), inference_model_dir+" not found."
+        if inference_model_dir is None and not self.has_init_pred:
+            raise ValueError('infer_model_path is required for prediction.')
+        # look the instance up by name; the first match wins
+        matched = [cand for cand in self.instances if cand.name == task_instance]
+        if not matched:
+            raise ValueError(task_instance + ' is not a valid task_instance.')
+        target = matched[0]
+        pred_prog = self._init_pred(target, inference_model_dir)
+        pred_reader = target.reader['pred']
+        pred_reader.load_data()
+        fetch_names, fetch_vars = target.pred_fetch_list
+        # pred_input is consumed as (input name, feed name) pairs
+        name_to_feed = {key: feed_name for key, feed_name in target.pred_input}
+        for batch in pred_reader.iterator():
+            encoded = _encode_inputs(batch, target.name, cand_set=name_to_feed)
+            feed_dict = {name_to_feed[key]: value for key, value in encoded.items()}
+            results = self.exe.run(pred_prog, feed_dict, fetch_vars)
+            target.postprocess(dict(zip(fetch_names, results)), phase='pred')
+        epoch_outputs = target.reader['pred'].get_epoch_outputs()
+        target.epoch_postprocess({'reader':epoch_outputs}, phase='pred')
+if __name__ == '__main__':
+    # Script entry: expects exactly one argument, the multi-task config path.
+    assert len(sys.argv) == 2, "Usage: python mtl_controller.py <mtl_conf_path>"
+    # The argument is removed from sys.argv before the controller is built.
+    conf_path = sys.argv.pop(1)
+    controller = Controller(conf_path)
+    do_train = controller.main_conf['do_train']
+    if do_train:
+        controller.train()
--- a/utils/__init__.py
+++ b/utils/__init__.py
--- a/optimizer/bert_optimizer.py
+++ b/optimizer/bert_optimizer.py
+# -*- coding: UTF-8 -*-
 #   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -48,25 +49,23 @@ def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps):
        return lr
-def optimization(loss, programs, args): 
+def optimize(loss, config, max_train_steps=None, warmup_steps=0, train_program=None):
-    train_program = programs[0]
-    startup_prog = programs[1]
-    warmup_steps = args.max_train_steps * args.warmup_proportion
    if warmup_steps > 0:
-        if args.lr_scheduler == 'noam_decay':
+        decay_strategy = config.get('lr_scheduler', 'linear_warmup_decay')
+        if decay_strategy == 'noam_decay':
            scheduled_lr = fluid.layers.learning_rate_scheduler\
-             .noam_decay(1/(warmup_steps *(float(args.learning_rate) ** 2)),
+             .noam_decay(1/(warmup_steps *(config['learning_rate'] ** 2)),
                         warmup_steps)
-        elif args.lr_scheduler == 'linear_warmup_decay':
+        elif decay_strategy == 'linear_warmup_decay':
-            scheduled_lr = linear_warmup_decay(float(args.learning_rate), warmup_steps,
+            scheduled_lr = linear_warmup_decay(config['learning_rate'], warmup_steps,
-                                               args.max_train_steps)
+                                               max_train_steps)
        else:
-            raise ValueError("Unkown learning rate scheduler, should be "
+            raise ValueError("Unkown lr_scheduler, should be "
                             "'noam_decay' or 'linear_warmup_decay'")
        optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
    else:
-        optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+        optimizer = fluid.optimizer.Adam(learning_rate=config['learning_rate'])
-        scheduled_lr = args.learning_rate
+        scheduled_lr = config['learning_rate']
    clip_norm_thres = 1.0
    # When using mixed precision training, scale the gradient clip threshold
@@ -91,13 +90,19 @@ def optimization(loss, programs, args):
    _, param_grads = optimizer.minimize(loss)
-    if args.weight_decay > 0:
+    for block in fluid.default_main_program().blocks:
+        for var_name in block.vars:
+            if var_name.startswith("embedding"):
+                print(block.vars[var_name])
+    if config.get('weight_decay', 0) > 0:
        for param, grad in param_grads:
            if exclude_from_weight_decay(param.name):
                continue
            with param.block.program._optimized_guard(
                [param, grad]), fluid.framework.name_scope("weight_decay"):
                updated_param = param - param_list[
-                    param.name] * args.weight_decay * scheduled_lr
+                    param.name] * config['weight_decay'] * scheduled_lr
                fluid.layers.assign(output=param, input=updated_param)
--- a/paddlepalm/reader/__init__.py
+++ b/paddlepalm/reader/__init__.py
--- a/paddlepalm/reader/cls4bert.py
+++ b/paddlepalm/reader/cls4bert.py
--- a/paddlepalm/reader/match4ernie.py
+++ b/paddlepalm/reader/match4ernie.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddlepalm.interface import reader
+from paddlepalm.reader.utils.reader4ernie import ClassifyReader
+class Reader(reader):
+    """Reader built on ClassifyReader that yields each batch as a dict of named fields."""
+    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
+        """
+        Args:
+            config: mapping with 'vocab_path', 'max_seq_len', 'batch_size' and
+                the input file of the chosen phase; other keys are optional.
+            phase: train, eval, pred
+            dev_count: forwarded to the underlying data generator.
+            print_prefix: accepted but not used here.
+            """
+        self._is_training = phase == 'train'
+        self._reader = ClassifyReader(
+            config['vocab_path'],
+            max_seq_len=config['max_seq_len'],
+            do_lower_case=config.get('do_lower_case', False),
+            for_cn=config.get('for_cn', False),
+            random_seed=config.get('seed', None))
+        self._dev_count = dev_count
+        self._batch_size = config['batch_size']
+        self._max_seq_len = config['max_seq_len']
+        if phase == 'train':
+            self._input_file = config['train_file']
+            # no epoch limit, so the training iterator never terminates
+            self._num_epochs = None
+            self._shuffle = config.get('shuffle', False)
+            self._shuffle_buffer = config.get('shuffle_buffer', 5000)
+        elif phase in ('eval', 'pred'):
+            # eval and pred differ only in which file they read
+            self._input_file = config['dev_file' if phase == 'eval' else 'pred_file']
+            self._num_epochs = 1
+            self._shuffle = False
+            self._batch_size = config.get('pred_batch_size', self._batch_size)
+        self._phase = phase
+        self._print_first_n = config.get('print_first_n', 1)
+    @property
+    def outputs_attr(self):
+        """Name -> [shape, dtype] of the fields yielded by `iterator`; labels only when training."""
+        if not self._is_training:
+            return {"token_ids": [[-1, -1, 1], 'int64'],
+                    "position_ids": [[-1, -1, 1], 'int64'],
+                    "segment_ids": [[-1, -1, 1], 'int64'],
+                    "task_ids": [[-1, -1, 1], 'int64'],
+                    "input_mask": [[-1, -1, 1], 'float32']
+                    }
+        return {"token_ids": [[-1, -1, 1], 'int64'],
+                "position_ids": [[-1, -1, 1], 'int64'],
+                "segment_ids": [[-1, -1, 1], 'int64'],
+                "input_mask": [[-1, -1, 1], 'float32'],
+                "label_ids": [[-1,1], 'int64'],
+                "task_ids": [[-1, -1, 1], 'int64']
+                }
+    def load_data(self):
+        """Create the batch generator for this phase's input file."""
+        self._data_generator = self._reader.data_generator(
+            self._input_file, self._batch_size, self._num_epochs,
+            dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)
+    def iterator(self): 
+        """Yield every batch as a dict keyed by field name."""
+        field_names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask', 
+            'label_ids', 'unique_ids']
+        for batch in self._data_generator():
+            record = dict(zip(field_names, batch))
+            # unique_ids is never exposed; label_ids only while training
+            del record['unique_ids']
+            if not self._is_training:
+                del record['label_ids']
+            yield record
+    def get_epoch_outputs(self):
+        """Return the examples and features the underlying reader holds for this phase."""
+        phase = self._phase
+        return {'examples': self._reader.get_examples(phase),
+                'features': self._reader.get_features(phase)}
+    @property
+    def num_examples(self):
+        """Number of examples the underlying reader reports for this phase."""
+        return self._reader.get_num_examples(phase=self._phase)
--- a/paddlepalm/reader/mlm.py
+++ b/paddlepalm/reader/mlm.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddlepalm.interface import reader
+from paddlepalm.reader.utils.reader4ernie import BaseReader
+class Reader(reader):
+    """Masked-language-model reader that yields each batch as a dict of named fields."""
+    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
+        """
+        Args:
+            config: mapping with 'vocab_path', 'max_seq_len', 'batch_size' and
+                the input file of the chosen phase; other keys are optional.
+            phase: train, eval, pred
+            dev_count: forwarded to the underlying data generator.
+            print_prefix: accepted but not used here.
+            """
+        self._is_training = phase == 'train'
+        # BaseReader is the only reader class this module imports; the previous
+        # reference to ClassifyReader was an undefined name here.
+        # NOTE(review): confirm BaseReader accepts these arguments and produces the
+        # mask_label/mask_pos fields, or swap in a mask-LM specific reader.
+        reader = BaseReader(config['vocab_path'],
+            max_seq_len=config['max_seq_len'],
+            do_lower_case=config.get('do_lower_case', False),
+            for_cn=config.get('for_cn', False),
+            random_seed=config.get('seed', None))
+        self._reader = reader
+        self._dev_count = dev_count
+        self._batch_size = config['batch_size']
+        self._max_seq_len = config['max_seq_len']
+        if phase == 'train':
+            self._input_file = config['train_file']
+            self._num_epochs = None # keep the training iterator from terminating
+            self._shuffle = config.get('shuffle', False)
+            self._shuffle_buffer = config.get('shuffle_buffer', 5000)
+        elif phase == 'eval':
+            self._input_file = config['dev_file']
+            self._num_epochs = 1
+            self._shuffle = False
+            self._batch_size = config.get('pred_batch_size', self._batch_size)
+        elif phase == 'pred':
+            self._input_file = config['pred_file']
+            self._num_epochs = 1
+            self._shuffle = False
+            self._batch_size = config.get('pred_batch_size', self._batch_size)
+        self._phase = phase
+        self._print_first_n = config.get('print_first_n', 1)
+    @property
+    def outputs_attr(self):
+        """Name -> [shape, dtype] description of the reader outputs.
+        NOTE(review): this still advertises 'label_ids' while `iterator` yields
+        'mask_label' and 'mask_pos'; the shapes of those two fields are not
+        visible here, so the declaration is left as it was - please reconcile.
+        """
+        if self._is_training:
+            return {"token_ids": [[-1, -1, 1], 'int64'],
+                    "position_ids": [[-1, -1, 1], 'int64'],
+                    "segment_ids": [[-1, -1, 1], 'int64'],
+                    "input_mask": [[-1, -1, 1], 'float32'],
+                    "label_ids": [[-1,1], 'int64'],
+                    "task_ids": [[-1, -1, 1], 'int64']
+                    }
+        else:
+            return {"token_ids": [[-1, -1, 1], 'int64'],
+                    "position_ids": [[-1, -1, 1], 'int64'],
+                    "segment_ids": [[-1, -1, 1], 'int64'],
+                    "task_ids": [[-1, -1, 1], 'int64'],
+                    "input_mask": [[-1, -1, 1], 'float32']
+                    }
+    def load_data(self):
+        """Create the batch generator for this phase's input file."""
+        self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)
+    def iterator(self): 
+        """Yield every batch as a dict keyed by field name."""
+        names = ['token_ids', 'position_ids', 'segment_ids', 'input_mask', 
+            'task_ids', 'mask_label', 'mask_pos']
+        for batch in self._data_generator():
+            outputs = dict(zip(names, batch))
+            # neither key is in `names`, so a plain `del` raised KeyError on every
+            # batch; pop() keeps the intent (never expose them) without crashing
+            outputs.pop('unique_ids', None)
+            if not self._is_training:
+                outputs.pop('label_ids', None)
+            yield outputs
+    def get_epoch_outputs(self):
+        """Return the examples and features the underlying reader holds for this phase."""
+        return {'examples': self._reader.get_examples(self._phase),
+                'features': self._reader.get_features(self._phase)}
+    @property
+    def num_examples(self):
+        """Number of examples the underlying reader reports for this phase."""
+        return self._reader.get_num_examples(phase=self._phase)
--- a/paddlepalm/reader/mrc4bert.py
+++ b/paddlepalm/reader/mrc4bert.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+import collections
+import json
+import random
+import numpy as np
+from paddlepalm.interface import reader
+from paddlepalm.utils.textprocess_helper import is_whitespace
+from paddlepalm.reader.utils.mrqa_helper import MRQAExample, MRQAFeature
+import paddlepalm.tokenizer.bert_tokenizer as tokenization
+class Reader(reader):
+    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
+        """
+        Args:
+            phase: train, eval, pred
+            """
+        self._is_training = phase == 'train'
+        self._tokenizer = tokenization.FullTokenizer(
+            vocab_file=config['vocab_path'], do_lower_case=config.get('do_lower_case', False))
+        self._max_seq_length = config['max_seq_len']
+        self._doc_stride = config['doc_stride']
+        self._max_query_length = config['max_query_len']
+        if phase == 'train':
+            self._input_file = config['train_file']
+            self._num_epochs = config['num_epochs']
+            self._shuffle = config.get('shuffle', False)
+            self._shuffle_buffer = config.get('shuffle_buffer', 5000)
+        if phase == 'eval':
+            self._input_file = config['dev_file']
+            self._num_epochs = 1
+            self._shuffle = False
+        elif phase == 'pred':
+            self._input_file = config['predict_file']
+            self._num_epochs = 1
+            self._shuffle = False
+        # self._batch_size = 
+        self._batch_size = config['batch_size']
+        self._pred_batch_size = config.get('pred_batch_size', self._batch_size)
+        self._print_first_n = config.get('print_first_n', 1)
+        self._with_negative = config.get('with_negative', False)
+        self._sample_rate = config.get('sample_rate', 0.02)
+        # TODO: without slide window version
+        self._with_slide_window = config.get('with_slide_window', False)
+        self.vocab = self._tokenizer.vocab
+        self.vocab_size = len(self.vocab)
+        self.pad_id = self.vocab["[PAD]"]
+        self.cls_id = self.vocab["[CLS]"]
+        self.sep_id = self.vocab["[SEP]"]
+        self.mask_id = self.vocab["[MASK]"]
+        self.current_train_example = -1
+        self.num_train_examples = -1
+        self.current_train_epoch = -1
+        self.n_examples = None
+        print(print_prefix + 'reading raw data...')
+        with open(input_file, "r") as reader:
+            self.raw_data = json.load(reader)["data"]
+        print(print_prfix + 'done!')
+    @property
+    def outputs_attr(self):
+        if self._is_training:
+            return {"token_ids": [[-1, self.max_seq_len, 1], 'int64'],
+                    "position_ids": [[-1, self.max_seq_len, 1], 'int64'],
+                    "segment_ids": [[-1, self.max_seq_len, 1], 'int64'],
+                    "input_mask": [[-1, self.max_seq_len, 1], 'float32'],
+                    "start_positions": [[-1, self.max_seq_len, 1], 'int64'],
+                    "end_positions": [[-1, self.max_seq_len, 1], 'int64']
+                    }
+        else:
+            return {"token_ids": [[-1, self.max_seq_len, 1], 'int64'],
+                    "position_ids": [[-1, self.max_seq_len, 1], 'int64'],
+                    "segment_ids": [[-1, self.max_seq_len, 1], 'int64'],
+                    "input_mask": [[-1, self.max_seq_len, 1], 'float32'],
+                    "unique_ids": [[-1, 1], 'int64']
+                    }
+    def iterator(self): 
+        features = []
+        for i in self._num_epochs:
+            if self._is_training:
+                print(self.print_prefix + '{} epoch {} {}'.format('-'*16, i, '-'*16))
+            example_id = 0
+            feature_id = 1000000000
+            for line in self.train_file:
+                raw = self.parse_line(line)
+                examples = _raw_to_examples(raw['context'], raw['qa_list'], is_training=self._is_training)
+                for example in examples:
+                    features.extend(_example_to_features(example, example_id, self._tokenizer, \
+                                        self._max_seq_length, self._doc_stride, self._max_query_length, \
+                                        id_offset=1000000000+len(features), is_training=self._is_training))
+                    if len(features) >= self._batch_size * self._dev_count:
+                        for batch, total_token_num in _features_to_batches( \
+                                                        features[:self._batch_size * self._dev_count], \
+                                                        batch_size, in_tokens=self._in_tokens):
+                            temp = prepare_batch_data(batch, total_token_num, \
+                                    max_len=self._max_seq_length, voc_size=-1, \
+                                    pad_id=self.pad_id, cls_id=self.cls_id, sep_id=self.sep_id, mask_id=-1, \
+                                    return_input_mask=True, return_max_len=False, return_num_token=False)
+                            if self._is_training:
+                                tok_ids, pos_ids, seg_ids, input_mask, start_positions, end_positions = temp
+                                yield {"token_ids": tok_ids, "position_ids": pos_ids, "segment_ids": seg_ids, "input_mask": input_mask, "start_positions": start_positions, 'end_positions': end_positions}
+                            else:
+                                tok_ids, pos_ids, seg_ids, input_mask, unique_ids = temp
+                                yield {"token_ids": tok_ids, "position_ids": pos_ids, "segment_ids": seg_ids, "input_mask": input_mask, "unique_ids": unique_ids}
+                        features = features[self._batch_size * self._dev_count:]
+                    example_id += 1
+        # The last batch may be discarded when running with distributed prediction, so we build some fake batches for the last prediction step.
+        if self._is_training and len(features) > 0:
+            pred_batches = []
+            for batch, total_token_num in _features_to_batches( \
+                                            features[:self._batch_size * self._dev_count], \
+                                            batch_size, in_tokens=self._in_tokens):
+                pred_batches.append(prepare_batch_data(batch, total_token_num, max_len=self._max_seq_length, voc_size=-1,
+                                        pad_id=self.pad_id, cls_id=self.cls_id, sep_id=self.sep_id, mask_id=-1, \
+                                        return_input_mask=True, return_max_len=False, return_num_token=False))
+            fake_batch = pred_batches[-1]
+            fake_batch = fake_batch[:-1] + [np.array([-1]*len(fake_batch[0]))]
+            pred_batches = pred_batches + [fake_batch] * (dev_count - len(pred_batches))
+            for batch in pred_batches:
+                yield batch
+    @property
+    def num_examples(self):
+        if self.n_examples is None:
+            self.n_examples = _estimate_runtime_examples(self.raw_data, self._sample_rate, self._tokenizer, \
+                                  self._max_seq_length, self._doc_stride, self._max_query_length, \
+                                  remove_impossible_questions=True, filter_invalid_spans=True)
+        return self.n_examples
+        # return math.ceil(n_examples * self._num_epochs / float(self._batch_size * self._dev_count))
+def _raw_to_examples(context, qa_list, is_training=True, remove_impossible_questions=True, filter_invalid_spans=True):
+    """
+    Args:
+        context: (str) the paragraph that provide information for QA
+        qa_list: (list) nested dict. Each element in qa_list should contain at least 'id' and 'question'. And the ....
+        """
+    examples = []
+    doc_tokens = []
+    char_to_word_offset = []
+    prev_is_whitespace = True
+    for c in context:
+        if is_whitespace(c):
+            prev_is_whitespace = True
+        else:
+            if prev_is_whitespace:
+                doc_tokens.append(c)
+            else:
+                doc_tokens[-1] += c
+            prev_is_whitespace = False
+        char_to_word_offset.append(len(doc_tokens) - 1)
+    for qa in qa_list:
+        qas_id = qa["id"]
+        question_text = qa["question"]
+        start_position = None
+        end_position = None
+        orig_answer_text = None
+        is_impossible = False
+        if is_training:
+            assert len(qa["answers"]) == 1, "For training, each question should have exactly 1 answer."
+            if ('is_impossible' in qa) and (qa["is_impossible"]):
+                if remove_impossible_questions or filter_invalid_spans:
+                    continue
+                else:
+                    start_position = -1
+                    end_position = -1
+                    orig_answer_text = ""
+                    is_impossible = True
+            else:
+                answer = qa["answers"][0]
+                orig_answer_text = answer["text"]
+                answer_offset = answer["answer_start"]
+                answer_length = len(orig_answer_text)
+                start_position = char_to_word_offset[answer_offset]
+                end_position = char_to_word_offset[answer_offset +
+                                                   answer_length - 1]
+                # remove corrupt samples
+                actual_text = " ".join(doc_tokens[start_position:(
+                    end_position + 1)])
+                cleaned_answer_text = " ".join(
+                    tokenization.whitespace_tokenize(orig_answer_text))
+                if actual_text.find(cleaned_answer_text) == -1:
+                    print(self.print_prefix + "Could not find answer: '%s' vs. '%s'",
+                          actual_text, cleaned_answer_text)
+                    continue
+        examples.append(MRQAExample(
+            qas_id=qas_id,
+            question_text=question_text,
+            doc_tokens=doc_tokens,
+            orig_answer_text=orig_answer_text,
+            start_position=start_position,
+            end_position=end_position,
+            is_impossible=is_impossible))
+    return examples
+def _example_to_features(example, example_id, tokenizer, max_seq_length, doc_stride, max_query_length, id_offset, is_training):
+    query_tokens = tokenizer.tokenize(example.question_text)
+    if len(query_tokens) > max_query_length:
+        query_tokens = query_tokens[0:max_query_length]
+    tok_to_orig_index = []
+    orig_to_tok_index = []
+    all_doc_tokens = []
+    for (i, token) in enumerate(example.doc_tokens):
+        orig_to_tok_index.append(len(all_doc_tokens))
+        sub_tokens = tokenizer.tokenize(token)
+        for sub_token in sub_tokens:
+            tok_to_orig_index.append(i)
+            all_doc_tokens.append(sub_token)
+    tok_start_position = None
+    tok_end_position = None
+    if is_training and example.is_impossible:
+        tok_start_position = -1
+        tok_end_position = -1
+    if is_training and not example.is_impossible:
+        tok_start_position = orig_to_tok_index[example.start_position]
+        if example.end_position < len(example.doc_tokens) - 1:
+            tok_end_position = orig_to_tok_index[example.end_position +
+                                                 1] - 1
+        else:
+            tok_end_position = len(all_doc_tokens) - 1
+        (tok_start_position, tok_end_position) = _improve_answer_span(
+            all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
+            example.orig_answer_text)
+    # The -3 accounts for [CLS], [SEP] and [SEP]
+    max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
+    # We can have documents that are longer than the maximum sequence length.
+    # To deal with this we do a sliding window approach, where we take chunks
+    # of the up to our max length with a stride of `doc_stride`.
+    _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
+        "DocSpan", ["start", "length"])
+    doc_spans = []
+    start_offset = 0
+    while start_offset < len(all_doc_tokens):
+        length = len(all_doc_tokens) - start_offset
+        if length > max_tokens_for_doc:
+            length = max_tokens_for_doc
+        doc_spans.append(_DocSpan(start=start_offset, length=length))
+        if start_offset + length == len(all_doc_tokens):
+            break
+        start_offset += min(length, doc_stride)
+    for (doc_span_index, doc_span) in enumerate(doc_spans):
+        tokens = []
+        token_to_orig_map = {}
+        token_is_max_context = {}
+        segment_ids = []
+        tokens.append("[CLS]")
+        segment_ids.append(0)
+        for token in query_tokens:
+            tokens.append(token)
+            segment_ids.append(0)
+        tokens.append("[SEP]")
+        segment_ids.append(0)
+        for i in range(doc_span.length):
+            split_token_index = doc_span.start + i
+            token_to_orig_map[len(tokens)] = tok_to_orig_index[
+                split_token_index]
+            is_max_context = _check_is_max_context(
+                doc_spans, doc_span_index, split_token_index)
+            token_is_max_context[len(tokens)] = is_max_context
+            tokens.append(all_doc_tokens[split_token_index])
+            segment_ids.append(1)
+        tokens.append("[SEP]")
+        segment_ids.append(1)
+        input_ids = tokenizer.convert_tokens_to_ids(tokens)
+        # The mask has 1 for real tokens and 0 for padding tokens. Only real
+        # tokens are attended to.
+        input_mask = [1] * len(input_ids)
+        # Zero-pad up to the sequence length.
+        #while len(input_ids) < max_seq_length:
+        #  input_ids.append(0)
+        #  input_mask.append(0)
+        #  segment_ids.append(0)
+        #assert len(input_ids) == max_seq_length
+        #assert len(input_mask) == max_seq_length
+        #assert len(segment_ids) == max_seq_length
+        start_position = None
+        end_position = None
+        if is_training and not example.is_impossible:
+            # For training, if our document chunk does not contain an annotation
+            # we throw it out, since there is nothing to predict.
+            doc_start = doc_span.start
+            doc_end = doc_span.start + doc_span.length - 1
+            out_of_span = False
+            if not (tok_start_position >= doc_start and
+                    tok_end_position <= doc_end):
+                out_of_span = True
+            if out_of_span:
+                start_position = 0
+                end_position = 0
+                continue
+            else:
+                doc_offset = len(query_tokens) + 2
+                start_position = tok_start_position - doc_start + doc_offset
+                end_position = tok_end_position - doc_start + doc_offset
+        if is_training and example.is_impossible:
+            start_position = 0
+            end_position = 0
+        def format_print():
+            print("*** Example ***")
+            print("unique_id: %s" % (unique_id))
+            print("example_index: %s" % (example_index))
+            print("doc_span_index: %s" % (doc_span_index))
+            print("tokens: %s" % " ".join(
+                [tokenization.printable_text(x) for x in tokens]))
+            print("token_to_orig_map: %s" % " ".join([
+                "%d:%d" % (x, y)
+                for (x, y) in six.iteritems(token_to_orig_map)
+            ]))
+            print("token_is_max_context: %s" % " ".join([
+                "%d:%s" % (x, y)
+                for (x, y) in six.iteritems(token_is_max_context)
+            ]))
+            print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+            print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+            print("segment_ids: %s" %
+                  " ".join([str(x) for x in segment_ids]))
+            if is_training and example.is_impossible:
+                print("impossible example")
+            if is_training and not example.is_impossible:
+                answer_text = " ".join(tokens[start_position:(end_position +
+                                                              1)])
+                print("start_position: %d" % (start_position))
+                print("end_position: %d" % (end_position))
+                print("answer: %s" %
+                      (tokenization.printable_text(answer_text)))
+        if self._print_first_n > 0:
+            format_print()
+            self._print_first_n -= 1
+        features.append(MRQAFeature(
+            unique_id=id_offset,
+            example_index=example_id,
+            doc_span_index=doc_span_index,
+            tokens=tokens,
+            token_to_orig_map=token_to_orig_map,
+            token_is_max_context=token_is_max_context,
+            input_ids=input_ids,
+            input_mask=input_mask,
+            segment_ids=segment_ids,
+            start_position=start_position,
+            end_position=end_position,
+            is_impossible=example.is_impossible))
+        id_offset += 1
+    return features
+def _features_to_batches(features, batch_size, in_tokens):
+    batch, total_token_num, max_len = [], 0, 0
+    for (index, feature) in enumerate(features):
+        if phase == 'train':
+            self.current_train_example = index + 1
+        seq_len = len(feature.input_ids)
+        labels = [feature.unique_id
+                  ] if feature.start_position is None else [
+                      feature.start_position, feature.end_position
+                  ]
+        example = [
+            feature.input_ids, feature.segment_ids, range(seq_len)
+        ] + labels
+        max_len = max(max_len, seq_len)
+        if in_tokens:
+            to_append = (len(batch) + 1) * max_len <= batch_size
+        else:
+            to_append = len(batch) < batch_size
+        if to_append:
+            batch.append(example)
+            total_token_num += seq_len
+        else:
+            yield batch, total_token_num
+            batch, total_token_num, max_len = [example
+                                               ], seq_len, seq_len
+    if len(batch) > 0:
+        yield batch, total_token_num
+def _estimate_runtime_examples(data, sample_rate, tokenizer, \
+                              max_seq_length, doc_stride, max_query_length, \
+                              remove_impossible_questions=True, filter_invalid_spans=True):
+    """Count runtime examples which may differ from number of raw samples due to sliding window operation and etc.. 
+       This is useful to get correct warmup steps for training."""
+    assert sample_rate > 0.0 and sample_rate <= 1.0, "sample_rate must be set between 0.0~1.0"
+    num_raw_examples = 0
+    for entry in data:
+        for paragraph in entry["paragraphs"]:
+            paragraph_text = paragraph["context"]
+            for qa in paragraph["qas"]:
+                num_raw_examples += 1
+    # print("num raw examples:{}".format(num_raw_examples))
+    def is_whitespace(c):
+        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
+            return True
+        return False
+    sampled_examples = []
+    first_samp = True
+    for entry in data:
+        for paragraph in entry["paragraphs"]:
+            doc_tokens = None
+            for qa in paragraph["qas"]:
+                if not first_samp and random.random() > sample_rate and sample_rate < 1.0:
+                    continue
+                if doc_tokens is None:
+                    paragraph_text = paragraph["context"]
+                    doc_tokens = []
+                    char_to_word_offset = []
+                    prev_is_whitespace = True
+                    for c in paragraph_text:
+                        if is_whitespace(c):
+                            prev_is_whitespace = True
+                        else:
+                            if prev_is_whitespace:
+                                doc_tokens.append(c)
+                            else:
+                                doc_tokens[-1] += c
+                            prev_is_whitespace = False
+                        char_to_word_offset.append(len(doc_tokens) - 1)
+                assert len(qa["answers"]) == 1, "For training, each question should have exactly 1 answer."
+                qas_id = qa["id"]
+                question_text = qa["question"]
+                start_position = None
+                end_position = None
+                orig_answer_text = None
+                is_impossible = False
+                if ('is_impossible' in qa) and (qa["is_impossible"]):
+                    if remove_impossible_questions or filter_invalid_spans:
+                        continue
+                    else:
+                        start_position = -1
+                        end_position = -1
+                        orig_answer_text = ""
+                        is_impossible = True
+                else:
+                    answer = qa["answers"][0]
+                    orig_answer_text = answer["text"]
+                    answer_offset = answer["answer_start"]
+                    answer_length = len(orig_answer_text)
+                    start_position = char_to_word_offset[answer_offset]
+                    end_position = char_to_word_offset[answer_offset +
+                                                       answer_length - 1]
+                    # remove corrupt samples
+                    actual_text = " ".join(doc_tokens[start_position:(
+                        end_position + 1)])
+                    cleaned_answer_text = " ".join(
+                        tokenization.whitespace_tokenize(orig_answer_text))
+                    if actual_text.find(cleaned_answer_text) == -1:
+                        continue
+                example = MRQAExample(
+                    qas_id=qas_id,
+                    question_text=question_text,
+                    doc_tokens=doc_tokens,
+                    orig_answer_text=orig_answer_text,
+                    start_position=start_position,
+                    end_position=end_position,
+                    is_impossible=is_impossible)
+                sampled_examples.append(example)
+                first_samp = False
+    runtime_sample_rate = len(sampled_examples) / float(num_raw_examples)
+    runtime_samp_cnt = 0
+    for example in sampled_examples:
+        query_tokens = tokenizer.tokenize(example.question_text)
+        if len(query_tokens) > max_query_length:
+            query_tokens = query_tokens[0:max_query_length]
+        tok_to_orig_index = []
+        orig_to_tok_index = []
+        all_doc_tokens = []
+        for (i, token) in enumerate(example.doc_tokens):
+            orig_to_tok_index.append(len(all_doc_tokens))
+            sub_tokens = tokenizer.tokenize(token)
+            for sub_token in sub_tokens:
+                tok_to_orig_index.append(i)
+                all_doc_tokens.append(sub_token)
+        tok_start_position = None
+        tok_end_position = None
+        tok_start_position = orig_to_tok_index[example.start_position]
+        if example.end_position < len(example.doc_tokens) - 1:
+            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
+        else:
+            tok_end_position = len(all_doc_tokens) - 1
+        (tok_start_position, tok_end_position) = _improve_answer_span(
+            all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
+            example.orig_answer_text)
+        # The -3 accounts for [CLS], [SEP] and [SEP]
+        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
+        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
+            "DocSpan", ["start", "length"])
+        doc_spans = []
+        start_offset = 0
+        while start_offset < len(all_doc_tokens):
+            length = len(all_doc_tokens) - start_offset
+            if length > max_tokens_for_doc:
+                length = max_tokens_for_doc
+            doc_spans.append(_DocSpan(start=start_offset, length=length))
+            if start_offset + length == len(all_doc_tokens):
+                break
+            start_offset += min(length, doc_stride)
+        for (doc_span_index, doc_span) in enumerate(doc_spans):
+            doc_start = doc_span.start
+            doc_end = doc_span.start + doc_span.length - 1
+            if filter_invalid_spans and not (tok_start_position >= doc_start and tok_end_position <= doc_end):
+                continue
+            runtime_samp_cnt += 1
+    return int(runtime_samp_cnt/runtime_sample_rate)
+def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
+                         orig_answer_text):
+    """Returns tokenized answer spans that better match the annotated answer."""
+    # The MRQA annotations are character based. We first project them to
+    # whitespace-tokenized words. But then after WordPiece tokenization, we can
+    # often find a "better match". For example:
+    #
+    #   Question: What year was John Smith born?
+    #   Context: The leader was John Smith (1895-1943).
+    #   Answer: 1895
+    #
+    # The original whitespace-tokenized answer will be "(1895-1943).". However
+    # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
+    # the exact answer, 1895.
+    #
+    # However, this is not always possible. Consider the following:
+    #
+    #   Question: What country is the top exporter of electornics?
+    #   Context: The Japanese electronics industry is the lagest in the world.
+    #   Answer: Japan
+    #
+    # In this case, the annotator chose "Japan" as a character sub-span of
+    # the word "Japanese". Since our WordPiece tokenizer does not split
+    # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
+    # in MRQA, but does happen.
+    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
+    for new_start in range(input_start, input_end + 1):
+        for new_end in range(input_end, new_start - 1, -1):
+            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
+            if text_span == tok_answer_text:
+                return (new_start, new_end)
+    return (input_start, input_end)
+def _check_is_max_context(doc_spans, cur_span_index, position):
+    """Check if this is the 'max context' doc span for the token."""
+    # Because of the sliding window approach taken to scoring documents, a single
+    # token can appear in multiple documents. E.g.
+    #  Doc: the man went to the store and bought a gallon of milk
+    #  Span A: the man went to the
+    #  Span B: to the store and bought
+    #  Span C: and bought a gallon of
+    #  ...
+    #
+    # Now the word 'bought' will have two scores from spans B and C. We only
+    # want to consider the score with "maximum context", which we define as
+    # the *minimum* of its left and right context (the *sum* of left and
+    # right context will always be the same, of course).
+    #
+    # In the example the maximum context for 'bought' would be span C since
+    # it has 1 left context and 3 right context, while span B has 4 left context
+    # and 0 right context.
+    best_score = None
+    best_span_index = None
+    for (span_index, doc_span) in enumerate(doc_spans):
+        end = doc_span.start + doc_span.length - 1
+        if position < doc_span.start:
+            continue
+        if position > end:
+            continue
+        num_left_context = position - doc_span.start
+        num_right_context = end - position
+        score = min(num_left_context,
+                    num_right_context) + 0.01 * doc_span.length
+        if best_score is None or score > best_score:
+            best_score = score
+            best_span_index = span_index
+    return cur_span_index == best_span_index
--- a/paddlepalm/reader/mrc4ernie.py
+++ b/paddlepalm/reader/mrc4ernie.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddlepalm.interface import reader
+from paddlepalm.reader.utils.reader4ernie import MRCReader
+class Reader(reader):
+    def __init__(self, config, phase='train', dev_count=1, print_prefix=''):
+        """
+        Args:
+            phase: train, eval, pred
+            """
+        self._is_training = phase == 'train'
+        reader = MRCReader(config['vocab_path'],
+            max_seq_len=config['max_seq_len'],
+            do_lower_case=config.get('do_lower_case', False),
+            tokenizer='FullTokenizer',
+            doc_stride=config['doc_stride'],
+            max_query_length=config['max_query_len'],
+            random_seed=config.get('seed', None))
+        self._reader = reader
+        self._dev_count = dev_count
+        self._batch_size = config['batch_size']
+        self._max_seq_len = config['max_seq_len']
+        if phase == 'train':
+            self._input_file = config['train_file']
+            # self._num_epochs = config['num_epochs']
+            self._num_epochs = None # 防止iteartor终止
+            self._shuffle = config.get('shuffle', False)
+            self._shuffle_buffer = config.get('shuffle_buffer', 5000)
+        if phase == 'eval':
+            self._input_file = config['dev_file']
+            self._num_epochs = 1
+            self._shuffle = False
+            self._batch_size = config.get('pred_batch_size', self._batch_size)
+        elif phase == 'pred':
+            self._input_file = config['pred_file']
+            self._num_epochs = 1
+            self._shuffle = False
+            self._batch_size = config.get('pred_batch_size', self._batch_size)
+        self._phase = phase
+        # self._batch_size = 
+        self._print_first_n = config.get('print_first_n', 1)
+        # TODO: without slide window version
+        self._with_slide_window = config.get('with_slide_window', False)
+    @property
+    def outputs_attr(self):
+        if self._is_training:
+            return {"token_ids": [[-1, -1, 1], 'int64'],
+                    "position_ids": [[-1, -1, 1], 'int64'],
+                    "segment_ids": [[-1, -1, 1], 'int64'],
+                    "input_mask": [[-1, -1, 1], 'float32'],
+                    "start_positions": [[-1, 1], 'int64'],
+                    "end_positions": [[-1, 1], 'int64'],
+                    "task_ids": [[-1, -1, 1], 'int64']
+                    }
+        else:
+            return {"token_ids": [[-1, -1, 1], 'int64'],
+                    "position_ids": [[-1, -1, 1], 'int64'],
+                    "segment_ids": [[-1, -1, 1], 'int64'],
+                    "task_ids": [[-1, -1, 1], 'int64'],
+                    "input_mask": [[-1, -1, 1], 'float32'],
+                    "unique_ids": [[-1, 1], 'int64']
+                    }
+    @property
+    def epoch_outputs_attr(self):
+        if not self._is_training:
+            return {"examples": None,
+                    "features": None}
+    def load_data(self):
+        self._data_generator = self._reader.data_generator(self._input_file, self._batch_size, self._num_epochs, dev_count=self._dev_count, shuffle=self._shuffle, phase=self._phase)
+    def iterator(self): 
+        def list_to_dict(x):
+            names = ['token_ids', 'segment_ids', 'position_ids', 'task_ids', 'input_mask', 
+                'start_positions', 'end_positions', 'unique_ids']
+            outputs = {n: i for n,i in zip(names, x)}
+            if self._is_training:
+                del outputs['unique_ids']
+            else:
+                del outputs['start_positions']
+                del outputs['end_positions']
+            return outputs
+        for batch in self._data_generator():
+            yield list_to_dict(batch)
+    def get_epoch_outputs(self):
+        return {'examples': self._reader.get_examples(self._phase),
+                'features': self._reader.get_features(self._phase)}
+    @property
+    def num_examples(self):
+        return self._reader.get_num_examples(phase=self._phase)
--- a/paddlepalm/reader/utils/__init__.py
+++ b/paddlepalm/reader/utils/__init__.py
--- a/utils/batching.py
+++ b/utils/batching.py
+# -*- coding: UTF-8 -*-
 #   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -11,7 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# -*- coding: utf-8 -*-
 """Mask, padding and batching."""
 from __future__ import absolute_import
 from __future__ import division

--- a/paddlepalm/reader/utils/batching4ernie.py
+++ b/paddlepalm/reader/utils/batching4ernie.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Mask, padding and batching."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+from six.moves import xrange
+def mask(batch_tokens,
+         seg_labels,
+         mask_word_tags,
+         total_token_num,
+         vocab_size,
+         CLS=1,
+         SEP=2,
+         MASK=3):
+    """
+    Apply masked-LM corruption to a batch of token-id sequences.
+
+    The sequences inside ``batch_tokens`` are modified in place.
+
+    Args:
+        batch_tokens: list of mutable token-id sequences, one per sentence.
+        seg_labels: per-sentence, per-token labels. Only the values 1 and -1
+            are tested here (presumably word-segmentation tags -- TODO
+            confirm their meaning against the caller).
+        mask_word_tags: per-sentence flags; a truthy flag selects the
+            span-based branch, a falsy one the per-token branch.
+        total_token_num: number of random draws to generate. Draws are
+            indexed by the flat token offset within the batch.
+        vocab_size: exclusive upper bound for random replacement ids.
+        CLS, SEP: special ids that the per-token branch never selects.
+        MASK: id written over masked tokens.
+
+    Returns:
+        (batch_tokens, mask_label, mask_pos). ``mask_label`` holds the
+        original ids of the selected tokens and ``mask_pos`` their flat
+        positions ``sent_index * max_len + token_index``; both are int64
+        arrays reshaped to [-1, 1].
+    """
+    max_len = max([len(sent) for sent in batch_tokens])
+    mask_label = []
+    mask_pos = []
+    # One uniform draw per token, looked up below by flat offset.
+    prob_mask = np.random.rand(total_token_num)
+    # Note: the first token is [CLS], so [low=1]
+    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
+    pre_sent_len = 0
+    prob_index = 0
+    for sent_index, sent in enumerate(batch_tokens):
+        # NOTE(review): mask_flag is assigned below but never read in this
+        # function.
+        mask_flag = False
+        mask_word = mask_word_tags[sent_index]
+        # Flat offset of this sentence's first token in prob_mask/replace_ids.
+        prob_index += pre_sent_len
+        if mask_word:
+            # Span-based branch: `beg` is the start index of the current span;
+            # 0 doubles as "no span open" (index 0 is never used as a start).
+            beg = 0
+            for token_index, token in enumerate(sent):
+                seg_label = seg_labels[sent_index][token_index]
+                # Label 1 does not close the current span; keep scanning.
+                if seg_label == 1:
+                    continue
+                if beg == 0:
+                    # No span open yet: open one here unless the label is -1.
+                    if seg_label != -1:
+                        beg = token_index
+                    continue
+                # The span [beg, token_index) is complete; the draw of its
+                # first token decides whether the whole span is selected.
+                prob = prob_mask[prob_index + beg]
+                if prob > 0.15:
+                    pass
+                else:
+                    for index in xrange(beg, token_index):
+                        prob = prob_mask[prob_index + index]
+                        # The first token's draw is already known to be
+                        # <= 0.15, so its thresholds are scaled by 0.15;
+                        # later tokens use their own draw on a [0, 1] scale.
+                        base_prob = 1.0
+                        if index == beg:
+                            base_prob = 0.15
+                        if base_prob * 0.2 < prob <= base_prob:
+                            # Replace with the MASK id.
+                            mask_label.append(sent[index])
+                            sent[index] = MASK
+                            mask_flag = True
+                            mask_pos.append(sent_index * max_len + index)
+                        elif base_prob * 0.1 < prob <= base_prob * 0.2:
+                            # Replace with a random id.
+                            mask_label.append(sent[index])
+                            sent[index] = replace_ids[prob_index + index]
+                            mask_flag = True
+                            mask_pos.append(sent_index * max_len + index)
+                        else:
+                            # Keep the token but still record it as a target.
+                            mask_label.append(sent[index])
+                            mask_pos.append(sent_index * max_len + index)
+                # Start the next span at this token, or close it on label -1.
+                if seg_label == -1:
+                    beg = 0
+                else:
+                    beg = token_index
+        else:
+            # Per-token branch: each token is judged by its own draw.
+            for token_index, token in enumerate(sent):
+                prob = prob_mask[prob_index + token_index]
+                if prob > 0.15:
+                    continue
+                elif 0.03 < prob <= 0.15:
+                    # mask
+                    if token != SEP and token != CLS:
+                        mask_label.append(sent[token_index])
+                        sent[token_index] = MASK
+                        mask_flag = True
+                        mask_pos.append(sent_index * max_len + token_index)
+                elif 0.015 < prob <= 0.03:
+                    # random replace
+                    if token != SEP and token != CLS:
+                        mask_label.append(sent[token_index])
+                        sent[token_index] = replace_ids[prob_index +
+                                                        token_index]
+                        mask_flag = True
+                        mask_pos.append(sent_index * max_len + token_index)
+                else:
+                    # keep the original token
+                    if token != SEP and token != CLS:
+                        mask_label.append(sent[token_index])
+                        mask_pos.append(sent_index * max_len + token_index)
+        pre_sent_len = len(sent)
+    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
+    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
+    return batch_tokens, mask_label, mask_pos
+def pad_batch_data(insts,
+                   pad_idx=0,
+                   return_pos=False,
+                   return_input_mask=False,
+                   return_max_len=False,
+                   return_num_token=False,
+                   return_seq_lens=False):
+    """
+    Pad the instances to the max sequence length in batch, and optionally
+    generate the corresponding position data, input mask and lengths.
+
+    Args:
+        insts: iterable of id sequences (lists, tuples or 1-D arrays).
+        pad_idx: value appended to short sequences; also used to pad the
+            position data.
+        return_pos: also return positions ``0..len-1`` per instance.
+        return_input_mask: also return a float32 mask (1 for real tokens,
+            0 for padding) of shape [batch, max_len, 1].
+        return_max_len: also return the padded length.
+        return_num_token: also return the total number of unpadded tokens.
+        return_seq_lens: also return the lengths as int64, shape [-1, 1].
+
+    Returns:
+        The padded int64 ids of shape [batch, max_len, 1] when no optional
+        output is requested, otherwise a list starting with the ids followed
+        by the requested outputs in the order of the flags above.
+
+    Raises:
+        ValueError: if ``insts`` is empty.
+    """
+    return_list = []
+    # Copy each instance into a list so that tuples and arrays can be padded
+    # by concatenation; `tuple + list` raises and `ndarray + list` broadcasts.
+    insts = [list(inst) for inst in insts]
+    seq_lens = [len(inst) for inst in insts]
+    max_len = max(seq_lens)
+    # Any token included in dict can be used to pad, since the paddings' loss
+    # will be masked out by weights and make no effect on parameter gradients.
+    inst_data = np.array(
+        [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
+    return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])]
+    # position data
+    if return_pos:
+        inst_pos = np.array([
+            list(range(0, length)) + [pad_idx] * (max_len - length)
+            for length in seq_lens
+        ])
+        return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])]
+    if return_input_mask:
+        # This is used to avoid attention on paddings.
+        input_mask_data = np.array(
+            [[1] * length + [0] * (max_len - length) for length in seq_lens])
+        input_mask_data = np.expand_dims(input_mask_data, axis=-1)
+        return_list += [input_mask_data.astype("float32")]
+    if return_max_len:
+        return_list += [max_len]
+    if return_num_token:
+        return_list += [sum(seq_lens)]
+    if return_seq_lens:
+        return_list += [np.array(seq_lens).astype("int64").reshape([-1, 1])]
+    return return_list if len(return_list) > 1 else return_list[0]
+# Running this module as a script does nothing; the guard is a placeholder.
+if __name__ == "__main__":
+    pass
--- a/paddlepalm/reader/utils/mlm_batching.py
+++ b/paddlepalm/reader/utils/mlm_batching.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Mask, padding and batching."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
+    """
+    Add mask for batch_tokens, return out, mask_label, mask_pos;
+    Note: mask_pos responding the batch_tokens after padded;
+
+    The sequences in ``batch_tokens`` are modified in place. Every token
+    other than CLS/SEP whose random draw is <= 0.15 becomes a prediction
+    target: it is replaced by ``MASK`` (draw in (0.03, 0.15]), replaced by a
+    random id (draw in (0.015, 0.03]) or kept unchanged (draw <= 0.015).
+
+    If a sentence ends up with no replaced token, one interior token that is
+    neither CLS nor SEP is chosen uniformly and masked. Sentences without
+    such a token are left untouched; the previous retry loop raised
+    ``ValueError`` for sentences of length <= 2 and never terminated when
+    every interior token was special.
+
+    Args:
+        batch_tokens: list of mutable token-id sequences.
+        total_token_num: number of random draws to generate; draws are
+            indexed by the flat token offset within the batch.
+        vocab_size: exclusive upper bound for random replacement ids.
+        CLS, SEP: special ids that are never selected.
+        MASK: id written over masked tokens.
+
+    Returns:
+        (batch_tokens, mask_label, mask_pos) where ``mask_label`` and
+        ``mask_pos`` are int64 arrays of shape [-1, 1] and each position is
+        ``sent_index * max_len + token_index``.
+    """
+    max_len = max([len(sent) for sent in batch_tokens])
+    mask_label = []
+    mask_pos = []
+    prob_mask = np.random.rand(total_token_num)
+    # Note: the first token is [CLS], so [low=1]
+    replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num)
+    # Flat offset of the current sentence's first token in the random draws.
+    prob_index = 0
+    for sent_index, sent in enumerate(batch_tokens):
+        mask_flag = False
+        for token_index, token in enumerate(sent):
+            prob = prob_mask[prob_index + token_index]
+            if prob > 0.15 or token == SEP or token == CLS:
+                continue
+            # Every selected token is a prediction target, whatever happens
+            # to its input id below.
+            mask_label.append(sent[token_index])
+            mask_pos.append(sent_index * max_len + token_index)
+            if prob > 0.03:
+                # mask
+                sent[token_index] = MASK
+                mask_flag = True
+            elif prob > 0.015:
+                # random replace
+                sent[token_index] = replace_ids[prob_index + token_index]
+                mask_flag = True
+            # else: keep the original token. As before, this does not set
+            # mask_flag, so the fallback below may select the same position
+            # a second time.
+        # ensure at least mask one word in a sentence
+        if not mask_flag:
+            candidates = [
+                index for index in range(1, len(sent) - 1)
+                if sent[index] != SEP and sent[index] != CLS
+            ]
+            if candidates:
+                token_index = candidates[np.random.randint(len(candidates))]
+                mask_label.append(sent[token_index])
+                sent[token_index] = MASK
+                mask_pos.append(sent_index * max_len + token_index)
+        prob_index += len(sent)
+    mask_label = np.array(mask_label).astype("int64").reshape([-1, 1])
+    mask_pos = np.array(mask_pos).astype("int64").reshape([-1, 1])
+    return batch_tokens, mask_label, mask_pos
+def prepare_batch_data(insts,
+                       total_token_num,
+                       max_len=None,
+                       voc_size=0,
+                       pad_id=None,
+                       cls_id=None,
+                       sep_id=None,
+                       mask_id=None,
+                       task_id=0,
+                       return_input_mask=True,
+                       return_max_len=True,
+                       return_num_token=False):
+    """
+    Mask and pad a batch of (token_ids, sentence_ids, position_ids) records.
+
+    Steps:
+    1. mask the token ids (before any padding);
+    2. pad token, position and sentence ids, and build the input mask;
+    3. build a task-id tensor shaped like the padded token ids.
+
+    ``return_input_mask``, ``return_max_len`` and ``return_num_token`` are
+    accepted but not consulted by this function.
+
+    Returns:
+        [src_id, pos_id, sent_id, self_input_mask, task_ids, mask_label,
+        mask_pos]
+    """
+    token_seqs = [record[0] for record in insts]
+    sentence_seqs = [record[1] for record in insts]
+    position_seqs = [record[2] for record in insts]
+    # Masking has to run first: it rewrites the unpadded token sequences.
+    masked_seqs, mask_label, mask_pos = mask(
+        token_seqs,
+        total_token_num,
+        vocab_size=voc_size,
+        CLS=cls_id,
+        SEP=sep_id,
+        MASK=mask_id)
+    src_id, self_input_mask = pad_batch_data(
+        masked_seqs, max_len=max_len, pad_idx=pad_id, return_input_mask=True)
+    # Position and sentence ids are padded the same way, ids only.
+    ids_only = dict(
+        max_len=max_len,
+        pad_idx=pad_id,
+        return_pos=False,
+        return_input_mask=False)
+    pos_id = pad_batch_data(position_seqs, **ids_only)
+    sent_id = pad_batch_data(sentence_seqs, **ids_only)
+    task_ids = np.ones_like(src_id, dtype="int64") * task_id
+    return [
+        src_id, pos_id, sent_id, self_input_mask, task_ids, mask_label,
+        mask_pos
+    ]
+def pad_batch_data(insts,
+                   max_len=None,
+                   pad_idx=0,
+                   return_pos=False,
+                   return_input_mask=False,
+                   return_max_len=False,
+                   return_num_token=False):
+    """
+    Pad every instance up to ``max_len`` (the longest instance when it is
+    None) and optionally emit position data, an input mask, the padded
+    length and the token count.
+
+    Returns the padded int64 ids of shape [batch, max_len, 1] when nothing
+    else is requested, otherwise a list starting with the ids followed by
+    the requested extras in the order of the ``return_*`` flags.
+    """
+    if max_len is None:
+        max_len = max(len(inst) for inst in insts)
+    # Any token included in dict can be used to pad, since the paddings' loss
+    # will be masked out by weights and make no effect on parameter gradients.
+    padded_rows = []
+    for inst in insts:
+        padded_rows.append(list(inst) + list([pad_idx] * (max_len - len(inst))))
+    outputs = [np.array(padded_rows).astype("int64").reshape([-1, max_len, 1])]
+    if return_pos:
+        # Positions count up from 0; the tail is filled with pad_idx.
+        position_rows = []
+        for inst in insts:
+            position_rows.append(
+                list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst)))
+        outputs.append(
+            np.array(position_rows).astype("int64").reshape([-1, max_len, 1]))
+    if return_input_mask:
+        # 1 marks a real token, 0 marks padding (keeps attention off pads).
+        mask_rows = []
+        for inst in insts:
+            mask_rows.append([1] * len(inst) + [0] * (max_len - len(inst)))
+        mask_array = np.expand_dims(np.array(mask_rows), axis=-1)
+        outputs.append(mask_array.astype("float32"))
+    if return_max_len:
+        outputs.append(max_len)
+    if return_num_token:
+        token_total = 0
+        for inst in insts:
+            token_total += len(inst)
+        outputs.append(token_total)
+    if len(outputs) > 1:
+        return outputs
+    return outputs[0]
+# Running this module as a script does nothing; the guard is a placeholder.
+if __name__ == "__main__":
+    pass
--- a/paddlepalm/reader/utils/mrqa_helper.py
+++ b/paddlepalm/reader/utils/mrqa_helper.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+class MRQAExample(object):
+    """A single training/test example for question answering.
+
+    Holds the question, the document tokens and, when available, the answer
+    text and its span.
+    For examples without an answer, the start and end position are -1.
+    """
+    def __init__(self,
+                 qas_id,
+                 question_text,
+                 doc_tokens,
+                 orig_answer_text=None,
+                 start_position=None,
+                 end_position=None,
+                 is_impossible=False):
+        self.qas_id = qas_id
+        self.question_text = question_text
+        self.doc_tokens = doc_tokens
+        self.orig_answer_text = orig_answer_text
+        self.start_position = start_position
+        self.end_position = end_position
+        self.is_impossible = is_impossible
+    @staticmethod
+    def _printable_text(text):
+        """Return `text` in a form that is safe to interpolate with %s."""
+        # Decode byte strings on Python 3; on Python 2 `bytes` is `str` and
+        # is returned unchanged.
+        if isinstance(text, bytes) and not isinstance(text, str):
+            return text.decode("utf-8", "ignore")
+        return text
+    def __str__(self):
+        return self.__repr__()
+    def __repr__(self):
+        """Build a one-line, human-readable summary of the example.
+
+        The positions are shown whenever they are set (including 0), each
+        guarded by its own value; ``is_impossible`` is shown when truthy.
+        """
+        # No dependency on a `tokenization` module here: this file does not
+        # import one, so referring to it raised NameError.
+        parts = [
+            "qas_id: %s" % (self._printable_text(self.qas_id), ),
+            "question_text: %s" % (self._printable_text(self.question_text), ),
+            "doc_tokens: [%s]" % (" ".join(self.doc_tokens)),
+        ]
+        if self.start_position is not None:
+            parts.append("start_position: %d" % (self.start_position))
+        if self.end_position is not None:
+            parts.append("end_position: %d" % (self.end_position))
+        if self.is_impossible:
+            parts.append("is_impossible: %r" % (self.is_impossible))
+        return ", ".join(parts)
+class MRQAFeature(object):
+    """Container for the features of a single document span.
+
+    Every constructor argument is stored unchanged on an attribute of the
+    same name; the three answer-related fields default to None.
+    """
+    def __init__(self,
+                 unique_id,
+                 example_index,
+                 doc_span_index,
+                 tokens,
+                 token_to_orig_map,
+                 token_is_max_context,
+                 input_ids,
+                 input_mask,
+                 segment_ids,
+                 start_position=None,
+                 end_position=None,
+                 is_impossible=None):
+        # Identification of the feature.
+        self.unique_id, self.example_index = unique_id, example_index
+        self.doc_span_index = doc_span_index
+        # Token-level data.
+        self.tokens = tokens
+        self.token_to_orig_map = token_to_orig_map
+        self.token_is_max_context = token_is_max_context
+        # Model inputs.
+        self.input_ids, self.input_mask = input_ids, input_mask
+        self.segment_ids = segment_ids
+        # Answer-related fields (optional).
+        self.start_position, self.end_position = start_position, end_position
+        self.is_impossible = is_impossible
--- a/paddlepalm/reader/utils/reader4ernie.py
+++ b/paddlepalm/reader/utils/reader4ernie.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+from __future__ import absolute_import
+import sys
+import os
+import json
+import random
+import logging
+import numpy as np
+import six
+from io import open
+from collections import namedtuple
+import paddlepalm.tokenizer.ernie_tokenizer as tokenization
+from paddlepalm.reader.utils.batching4ernie import pad_batch_data
+from paddlepalm.reader.utils.mlm_batching import prepare_batch_data
+log = logging.getLogger(__name__)
+# On Python 3, replace stdout/stderr with UTF-8 text wrappers around their
+# underlying binary buffers. This runs at import time and affects the whole
+# process.
+# NOTE(review): this relies on sys.stdout/sys.stderr exposing `.buffer`;
+# confirm that holds when the streams have been replaced by the host
+# environment (e.g. a notebook or a test runner capturing output).
+if six.PY3:
+    import io
+    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
+    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
+def csv_reader(fd, delimiter='\t'):
+    """Lazily split each line of `fd` on `delimiter`.
+
+    A trailing newline is stripped before splitting. Lines with several
+    fields are yielded as a list; a line with exactly one field is yielded
+    as a 1-tuple wrapping that list.
+    """
+    for raw_line in fd:
+        fields = raw_line.rstrip('\n').split(delimiter)
+        yield (fields, ) if len(fields) == 1 else fields
+class BaseReader(object):
+    """Read TSV examples and turn them into batches of tokenized records.
+
+    Subclasses provide ``_pad_batch_records``, which ``_prepare_batch_data``
+    calls to convert a list of records into one batch.
+    """
+    def __init__(self,
+                 vocab_path,
+                 label_map_config=None,
+                 max_seq_len=512,
+                 do_lower_case=True,
+                 in_tokens=False,
+                 is_inference=False,
+                 random_seed=None,
+                 tokenizer="FullTokenizer",
+                 is_classify=True,
+                 is_regression=False,
+                 for_cn=True,
+                 task_id=0):
+        """Build the tokenizer, cache special ids and load the label map.
+
+        Note that ``np.random.seed(random_seed)`` reseeds NumPy's global
+        generator, and that the ``tokenizer`` argument is accepted but not
+        used: a ``FullTokenizer`` is always created.
+        """
+        self.max_seq_len = max_seq_len
+        self.tokenizer = tokenization.FullTokenizer(
+            vocab_file=vocab_path, do_lower_case=do_lower_case)
+        self.vocab = self.tokenizer.vocab
+        self.pad_id = self.vocab["[PAD]"]
+        self.cls_id = self.vocab["[CLS]"]
+        self.sep_id = self.vocab["[SEP]"]
+        self.in_tokens = in_tokens
+        self.is_inference = is_inference
+        self.for_cn = for_cn
+        self.task_id = task_id
+        np.random.seed(random_seed)
+        self.is_classify = is_classify
+        self.is_regression = is_regression
+        self.current_example = 0
+        self.current_epoch = 0
+        self.num_examples = 0
+        # Examples loaded by data_generator, keyed by phase name.
+        self.examples = {}
+        if label_map_config:
+            with open(label_map_config, encoding='utf8') as f:
+                self.label_map = json.load(f)
+        else:
+            self.label_map = None
+    def get_train_progress(self):
+        """Gets progress for training phase."""
+        return self.current_example, self.current_epoch
+    def _read_tsv(self, input_file, quotechar=None):
+        """Reads a tab separated value file.
+
+        The first row supplies the field names of the ``Example`` namedtuple
+        built for every following row. ``quotechar`` is not used.
+        """
+        with open(input_file, 'r', encoding='utf8') as f:
+            reader = csv_reader(f)
+            headers = next(reader)
+            Example = namedtuple('Example', headers)
+            return [Example(*line) for line in reader]
+    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
+        """Truncates a sequence pair in place to the maximum length."""
+        # This is a simple heuristic which will always truncate the longer sequence
+        # one token at a time. This makes more sense than truncating an equal percent
+        # of tokens from each, since if one sequence is very short then each token
+        # that's truncated likely contains more information than a longer sequence.
+        while len(tokens_a) + len(tokens_b) > max_length:
+            if len(tokens_a) > len(tokens_b):
+                tokens_a.pop()
+            else:
+                tokens_b.pop()
+    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
+        """Converts a single `Example` into a single `Record`.
+
+        The record holds ``token_ids``, ``text_type_ids`` and
+        ``position_ids``; outside of inference it also holds ``label_id``
+        (mapped through ``self.label_map`` when one is loaded) and ``qid``
+        (None when the example has no such field).
+        """
+        text_a = tokenization.convert_to_unicode(example.text_a)
+        tokens_a = tokenizer.tokenize(text_a)
+        tokens_b = None
+        has_text_b = False
+        if isinstance(example, dict):
+            has_text_b = "text_b" in example.keys()
+        else:
+            has_text_b = "text_b" in example._fields
+        if has_text_b:
+            text_b = tokenization.convert_to_unicode(example.text_b)
+            tokens_b = tokenizer.tokenize(text_b)
+        if tokens_b:
+            # Modifies `tokens_a` and `tokens_b` in place so that the total
+            # length is less than the specified length.
+            # Account for [CLS], [SEP], [SEP] with "- 3"
+            self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+        else:
+            # Account for [CLS] and [SEP] with "- 2"
+            if len(tokens_a) > max_seq_length - 2:
+                tokens_a = tokens_a[0:(max_seq_length - 2)]
+        # The convention in BERT/ERNIE is:
+        # (a) For sequence pairs:
+        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
+        # (b) For single sequences:
+        #  tokens:   [CLS] the dog is hairy . [SEP]
+        #  type_ids: 0     0   0   0  0     0 0
+        #
+        # Where "type_ids" are used to indicate whether this is the first
+        # sequence or the second sequence. The embedding vectors for `type=0` and
+        # `type=1` were learned during pre-training and are added to the wordpiece
+        # embedding vector (and position vector). This is not *strictly* necessary
+        # since the [SEP] token unambiguously separates the sequences, but it makes
+        # it easier for the model to learn the concept of sequences.
+        #
+        # For classification tasks, the first vector (corresponding to [CLS]) is
+        # used as the "sentence vector". Note that this only makes sense because
+        # the entire model is fine-tuned.
+        tokens = []
+        text_type_ids = []
+        tokens.append("[CLS]")
+        text_type_ids.append(0)
+        for token in tokens_a:
+            tokens.append(token)
+            text_type_ids.append(0)
+        tokens.append("[SEP]")
+        text_type_ids.append(0)
+        if tokens_b:
+            for token in tokens_b:
+                tokens.append(token)
+                text_type_ids.append(1)
+            tokens.append("[SEP]")
+            text_type_ids.append(1)
+        token_ids = tokenizer.convert_tokens_to_ids(tokens)
+        position_ids = list(range(len(token_ids)))
+        if self.is_inference:
+            Record = namedtuple('Record',
+                                ['token_ids', 'text_type_ids', 'position_ids'])
+            record = Record(
+                token_ids=token_ids,
+                text_type_ids=text_type_ids,
+                position_ids=position_ids)
+        else:
+            if self.label_map:
+                label_id = self.label_map[example.label]
+            else:
+                label_id = example.label
+            Record = namedtuple('Record', [
+                'token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'
+            ])
+            qid = None
+            if "qid" in example._fields:
+                qid = example.qid
+            record = Record(
+                token_ids=token_ids,
+                text_type_ids=text_type_ids,
+                position_ids=position_ids,
+                label_id=label_id,
+                qid=qid)
+        return record
+    def _pad_batch_records(self, batch_records):
+        """Turn a list of records into one padded batch (subclass hook)."""
+        raise NotImplementedError(
+            "%s must implement _pad_batch_records" % type(self).__name__)
+    def _prepare_batch_data(self, examples, batch_size, phase=None):
+        """generate batch records
+
+        With ``self.in_tokens`` set, ``batch_size`` bounds
+        ``number of records * longest record``; otherwise it bounds the
+        number of records. A trailing partial batch is only yielded when
+        ``phase == 'pred'``.
+        """
+        batch_records, max_len = [], 0
+        for index, example in enumerate(examples):
+            if phase == "train":
+                self.current_example = index
+            record = self._convert_example_to_record(example, self.max_seq_len,
+                                                     self.tokenizer)
+            max_len = max(max_len, len(record.token_ids))
+            if self.in_tokens:
+                to_append = (len(batch_records) + 1) * max_len <= batch_size
+            else:
+                to_append = len(batch_records) < batch_size
+            if to_append:
+                batch_records.append(record)
+            else:
+                yield self._pad_batch_records(batch_records)
+                batch_records, max_len = [record], len(record.token_ids)
+        if phase == 'pred' and batch_records:
+            print('the last batch yielded.')
+            yield self._pad_batch_records(batch_records)
+    def get_num_examples(self, input_file=None, phase=None):
+        """Return the number of examples for `phase` (default ``'all'``).
+
+        Uses the examples already loaded by ``data_generator`` for that
+        phase; when none are loaded, counts the rows of `input_file`.
+
+        Raises:
+            ValueError: if the phase is not loaded and no `input_file` is
+                given.
+        """
+        if phase is None:
+            phase = 'all'
+        # `self.examples` is always a dict, so test for the phase itself
+        # instead of comparing the dict with None.
+        if phase in self.examples:
+            return len(self.examples[phase])
+        if input_file is None:
+            raise ValueError(
+                "Argument input_file should be given or the data_generator should be created when this func is called."
+            )
+        return len(self._read_tsv(input_file))
+    def data_generator(self,
+                       input_file,
+                       batch_size,
+                       epoch,
+                       dev_count=1,
+                       shuffle=True,
+                       phase=None):
+        """Load `input_file` and return a callable producing batches.
+
+        The examples are read immediately and stored under
+        ``self.examples[phase]`` (``'all'`` when `phase` is None). Each call
+        of the returned function starts a new generator that runs `epoch`
+        passes (99999999 when `epoch` is None), shuffling the examples in
+        place before each pass when `shuffle` is true. Batches are released
+        in groups of `dev_count`.
+        """
+        examples = self._read_tsv(input_file)
+        if phase is None:
+            phase = 'all'
+        self.examples[phase] = examples
+        def wrapper():
+            all_dev_batches = []
+            if epoch is None:
+                num_epochs = 99999999
+            else:
+                num_epochs = epoch
+            for epoch_index in range(num_epochs):
+                if phase == "train":
+                    self.current_example = 0
+                    self.current_epoch = epoch_index
+                if shuffle:
+                    np.random.shuffle(examples)
+                for batch_data in self._prepare_batch_data(
+                        examples, batch_size, phase=phase):
+                    if len(all_dev_batches) < dev_count:
+                        all_dev_batches.append(batch_data)
+                    # Release batches only once a full group is available.
+                    if len(all_dev_batches) == dev_count:
+                        for batch in all_dev_batches:
+                            yield batch
+                        all_dev_batches = []
+        return wrapper
+class MaskLMReader(BaseReader):
+    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
+        """Tokenize one example into a (token_ids, text_type_ids, position_ids) record.
+
+        Layout of the result (BERT/ERNIE convention):
+          pair:   [CLS] a... [SEP] b... [SEP]   type ids 0 for the first
+                  segment including its [SEP], 1 for the second one
+          single: [CLS] a... [SEP]              type ids all 0
+        A pair is truncated in place to ``max_seq_length - 3`` tokens, a
+        single text to ``max_seq_length - 2``, leaving room for the special
+        tokens. Position ids simply count up from 0.
+
+        The texts are read through attribute access (``example.text_a``);
+        the dict check only changes how the presence of ``text_b`` is
+        detected.
+        """
+        tokens_a = tokenizer.tokenize(
+            tokenization.convert_to_unicode(example.text_a))
+        if isinstance(example, dict):
+            has_text_b = "text_b" in example.keys()
+        else:
+            has_text_b = "text_b" in example._fields
+        tokens_b = None
+        if has_text_b:
+            tokens_b = tokenizer.tokenize(
+                tokenization.convert_to_unicode(example.text_b))
+        if tokens_b:
+            # Room for [CLS], [SEP], [SEP].
+            self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+        elif len(tokens_a) > max_seq_length - 2:
+            # Room for [CLS] and [SEP].
+            tokens_a = tokens_a[0:(max_seq_length - 2)]
+        # First segment: [CLS] + tokens_a + [SEP], all of type 0.
+        tokens = ["[CLS]"]
+        tokens.extend(tokens_a)
+        tokens.append("[SEP]")
+        text_type_ids = [0] * len(tokens)
+        if tokens_b:
+            # Second segment: tokens_b + [SEP], all of type 1.
+            tokens.extend(tokens_b)
+            tokens.append("[SEP]")
+            text_type_ids.extend([1] * (len(tokens) - len(text_type_ids)))
+        token_ids = tokenizer.convert_tokens_to_ids(tokens)
+        position_ids = list(range(len(token_ids)))
+        Record = namedtuple('Record',
+                            ['token_ids', 'text_type_ids', 'position_ids'])
+        return Record(
+            token_ids=token_ids,
+            text_type_ids=text_type_ids,
+            position_ids=position_ids)
+    def batch_reader(self, examples, batch_size, in_tokens, phase):
+        """Group converted examples into batches.
+
+        This is an instance method and must be invoked as
+        ``self.batch_reader(...)``: it needs ``self.max_seq_len``,
+        ``self.tokenizer`` and ``self._convert_example_to_record``.
+
+        Args:
+            examples: iterable of examples accepted by
+                ``_convert_example_to_record``.
+            batch_size: with `in_tokens`, bounds
+                ``number of records * longest record``; otherwise bounds the
+                number of records per batch.
+            in_tokens: selects the meaning of `batch_size` (see above).
+            phase: a trailing partial batch is only yielded when this is
+                ``'pred'``.
+
+        Yields:
+            ``(batch, total_token_num)`` where `batch` is a list of records
+            and `total_token_num` the sum of their token counts.
+        """
+        batch, total_token_num, max_len = [], 0, 0
+        for example in examples:
+            # Keep the whole record: downstream code indexes it as
+            # (token_ids, text_type_ids, position_ids).
+            record = self._convert_example_to_record(
+                example, self.max_seq_len, self.tokenizer)
+            num_tokens = len(record.token_ids)
+            max_len = max(max_len, num_tokens)
+            if in_tokens:
+                to_append = (len(batch) + 1) * max_len <= batch_size
+            else:
+                to_append = len(batch) < batch_size
+            if to_append:
+                batch.append(record)
+                total_token_num += num_tokens
+            else:
+                yield batch, total_token_num
+                batch, total_token_num, max_len = [record], num_tokens, num_tokens
+        if len(batch) > 0 and phase == 'pred':
+            yield batch, total_token_num
+    def data_generator(self,
+                       input_file,
+                       batch_size,
+                       epoch,
+                       dev_count=1,
+                       shuffle=True,
+                       phase=None):
+        """Build a generator function that yields prepared batches.
+
+        Args:
+            input_file: path handed to `self._read_tsv`.
+            batch_size: accepted for interface compatibility.
+            epoch: number of passes over the data; `None` means 99999999.
+            dev_count: number of batches buffered before they are yielded
+                together.
+            shuffle: whether `examples` is shuffled in place at the start of
+                every epoch.
+            phase: key under which the examples are stored in
+                `self.examples`; `None` is replaced by 'all'.
+
+        Returns:
+            A zero-argument generator function (`wrapper`).
+        """
+        # NOTE(review): the `batch_size` argument is not used below; batching
+        # reads `self.batch_size` instead -- confirm this is intended.
+        examples = self._read_tsv(input_file)
+        if phase is None:
+            phase = 'all'
+        self.examples[phase] = examples
+        def wrapper():
+            all_dev_batches = []
+            if epoch is None:
+                num_epochs = 99999999
+            else:
+                num_epochs = epoch
+            for epoch_index in range(num_epochs):
+                if phase == "train":
+                    # Progress counters are only tracked for training.
+                    self.current_example = 0
+                    self.current_epoch = epoch_index
+                if shuffle:
+                    np.random.shuffle(examples)
+                # The buffer is reset every epoch, so batches left over from
+                # the previous epoch (fewer than `dev_count`) are dropped.
+                all_dev_batches = []
+                # NOTE(review): `batch_reader` and `prepare_batch_data` are
+                # referenced as bare names; they must be resolvable from the
+                # enclosing module scope -- verify.
+                for batch_data, total_token_num in batch_reader(examples, 
+                                                    self.batch_size, self.in_tokens, phase=phase):
+                    batch_data = prepare_batch_data(
+                        batch_data,
+                        total_token_num,
+                        voc_size=self.voc_size,
+                        pad_id=self.pad_id,
+                        cls_id=self.cls_id,
+                        sep_id=self.sep_id,
+                        mask_id=self.mask_id,
+                        max_len=self.max_seq_len,
+                        return_input_mask=True,
+                        return_max_len=False,
+                        return_num_token=False)
+                    # Emit batches in groups of exactly `dev_count`.
+                    if len(all_dev_batches) < dev_count:
+                        all_dev_batches.append(batch_data)
+                    if len(all_dev_batches) == dev_count:
+                        for batch in all_dev_batches:
+                            yield batch
+                        all_dev_batches = []
+        return wrapper
+class ClassifyReader(BaseReader):
+    """Reader for sentence-level tasks with one label per example."""
+    def _read_tsv(self, input_file, quotechar=None):
+        """Load a header-led file into a list of `Example` namedtuples.
+
+        The first row supplies the field names. When `self.for_cn` is true,
+        spaces are removed from every column whose header is not "label".
+        `quotechar` is accepted but not used.
+        """
+        with open(input_file, 'r', encoding='utf8') as f:
+            rows = csv_reader(f)
+            headers = next(rows)
+            text_columns = [
+                col for col, header in enumerate(headers) if header != "label"
+            ]
+            Example = namedtuple('Example', headers)
+            examples = []
+            for row in rows:
+                for col, text in enumerate(row):
+                    if col not in text_columns:
+                        continue
+                    row[col] = text.replace(' ', '') if self.for_cn else text
+                examples.append(Example(*row))
+            return examples
+    def _pad_batch_records(self, batch_records):
+        """Pad a list of records into the arrays fed to the model.
+
+        Returns a list of padded token ids, text-type ids, position ids, task
+        ids and the input mask; outside inference the labels and qids are
+        appended as well.
+        """
+        token_seqs = [rec.token_ids for rec in batch_records]
+        type_seqs = [rec.text_type_ids for rec in batch_records]
+        pos_seqs = [rec.position_ids for rec in batch_records]
+        with_labels = not self.is_inference
+        if with_labels:
+            labels = [rec.label_id for rec in batch_records]
+            if self.is_classify:
+                labels = np.array(labels).astype("int64").reshape([-1, 1])
+            elif self.is_regression:
+                labels = np.array(labels).astype("float32").reshape([-1, 1])
+            # qids are taken from the records only if the first record has one.
+            if batch_records[0].qid:
+                qids = [rec.qid for rec in batch_records]
+                qids = np.array(qids).astype("int64").reshape([-1, 1])
+            else:
+                qids = np.array([]).astype("int64").reshape([-1, 1])
+        # padding
+        padded_token_ids, input_mask = pad_batch_data(
+            token_seqs, pad_idx=self.pad_id, return_input_mask=True)
+        padded_text_type_ids = pad_batch_data(type_seqs, pad_idx=self.pad_id)
+        padded_position_ids = pad_batch_data(pos_seqs, pad_idx=self.pad_id)
+        padded_task_ids = np.ones_like(
+            padded_token_ids, dtype="int64") * self.task_id
+        outputs = [
+            padded_token_ids, padded_text_type_ids, padded_position_ids,
+            padded_task_ids, input_mask
+        ]
+        if with_labels:
+            outputs += [labels, qids]
+        return outputs
+class SequenceLabelReader(BaseReader):
+    """Reader for token-level tagging tasks (one label per token)."""
+    def _pad_batch_records(self, batch_records):
+        """Pad a list of records into the arrays fed to the model.
+
+        Returns a list of padded token ids, text-type ids, position ids, task
+        ids, input mask, padded label ids and the sequence lengths. Labels are
+        padded with `len(self.label_map) - 1`.
+        """
+        batch_token_ids = [record.token_ids for record in batch_records]
+        batch_text_type_ids = [record.text_type_ids for record in batch_records]
+        batch_position_ids = [record.position_ids for record in batch_records]
+        batch_label_ids = [record.label_ids for record in batch_records]
+        # padding
+        padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
+            batch_token_ids,
+            pad_idx=self.pad_id,
+            return_input_mask=True,
+            return_seq_lens=True)
+        padded_text_type_ids = pad_batch_data(
+            batch_text_type_ids, pad_idx=self.pad_id)
+        padded_position_ids = pad_batch_data(
+            batch_position_ids, pad_idx=self.pad_id)
+        padded_label_ids = pad_batch_data(
+            batch_label_ids, pad_idx=len(self.label_map) - 1)
+        padded_task_ids = np.ones_like(
+            padded_token_ids, dtype="int64") * self.task_id
+        return_list = [
+            padded_token_ids, padded_text_type_ids, padded_position_ids,
+            padded_task_ids, input_mask, padded_label_ids, batch_seq_lens
+        ]
+        return return_list
+    def _reseg_token_label(self, tokens, labels, tokenizer):
+        """Re-tokenize `tokens` and expand `labels` to match the sub-tokens.
+
+        A token that `tokenizer.tokenize` splits into several pieces has its
+        label spread over the pieces: "O" and "I-x" are repeated, "B-x" becomes
+        B-x followed by I-x, "E-x" becomes I-x followed by E-x, and "S-x"
+        becomes B-x, I-x..., E-x. Tokens that yield no piece are dropped.
+
+        Returns:
+            `(ret_tokens, ret_labels)`, two lists of equal length.
+        """
+        assert len(tokens) == len(labels)
+        ret_tokens = []
+        ret_labels = []
+        for token, label in zip(tokens, labels):
+            sub_token = tokenizer.tokenize(token)
+            if len(sub_token) == 0:
+                continue
+            ret_tokens.extend(sub_token)
+            if len(sub_token) == 1:
+                ret_labels.append(label)
+                continue
+            if label == "O" or label.startswith("I-"):
+                ret_labels.extend([label] * len(sub_token))
+            elif label.startswith("B-"):
+                i_label = "I-" + label[2:]
+                ret_labels.extend([label] + [i_label] * (len(sub_token) - 1))
+            elif label.startswith("S-"):
+                b_label = "B-" + label[2:]
+                e_label = "E-" + label[2:]
+                i_label = "I-" + label[2:]
+                ret_labels.extend([b_label] + [i_label] * (len(sub_token) - 2) + [e_label])
+            elif label.startswith("E-"):
+                i_label = "I-" + label[2:]
+                ret_labels.extend([i_label] * (len(sub_token) - 1) + [label])
+        # A multi-piece token whose label matches none of the schemes above
+        # contributes no labels and is caught here.
+        assert len(ret_tokens) == len(ret_labels)
+        return ret_tokens, ret_labels
+    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
+        """Convert one tagged example into a padded-ready `Record`.
+
+        `example.text_a` and `example.label` are split into tokens and labels,
+        re-segmented with the tokenizer, truncated to `max_seq_length - 2` and
+        wrapped in [CLS]/[SEP]. The two special tokens receive the label id
+        `len(self.label_map) - 1`.
+        """
+        # Tokens and labels are separated by the ^B control character
+        # (U+0002). The separator is written as an explicit escape: an empty
+        # separator makes `str.split` raise ValueError.
+        # NOTE(review): confirm the data files use U+0002 as delimiter.
+        tokens = tokenization.convert_to_unicode(example.text_a).split(u"\x02")
+        labels = tokenization.convert_to_unicode(example.label).split(u"\x02")
+        tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)
+        if len(tokens) > max_seq_length - 2:
+            tokens = tokens[0:(max_seq_length - 2)]
+            labels = labels[0:(max_seq_length - 2)]
+        tokens = ["[CLS]"] + tokens + ["[SEP]"]
+        token_ids = tokenizer.convert_tokens_to_ids(tokens)
+        position_ids = list(range(len(token_ids)))
+        text_type_ids = [0] * len(token_ids)
+        no_entity_id = len(self.label_map) - 1
+        label_ids = [no_entity_id] + [
+            self.label_map[label] for label in labels
+        ] + [no_entity_id]
+        Record = namedtuple(
+            'Record',
+            ['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
+        record = Record(
+            token_ids=token_ids,
+            text_type_ids=text_type_ids,
+            position_ids=position_ids,
+            label_ids=label_ids)
+        return record
+class ExtractEmbeddingReader(BaseReader):
+    """Reader that prepares unlabeled batches (ids, mask, sequence lengths)."""
+    def _pad_batch_records(self, batch_records):
+        """Pad a list of records into the arrays fed to the model.
+
+        Returns a list of padded token ids, text-type ids, position ids, task
+        ids, the input mask and the sequence lengths.
+        """
+        token_seqs = [rec.token_ids for rec in batch_records]
+        type_seqs = [rec.text_type_ids for rec in batch_records]
+        pos_seqs = [rec.position_ids for rec in batch_records]
+        # padding
+        padded_token_ids, input_mask, seq_lens = pad_batch_data(
+            token_seqs,
+            pad_idx=self.pad_id,
+            return_input_mask=True,
+            return_seq_lens=True)
+        padded_text_type_ids = pad_batch_data(type_seqs, pad_idx=self.pad_id)
+        padded_position_ids = pad_batch_data(pos_seqs, pad_idx=self.pad_id)
+        padded_task_ids = np.ones_like(
+            padded_token_ids, dtype="int64") * self.task_id
+        return [
+            padded_token_ids, padded_text_type_ids, padded_position_ids,
+            padded_task_ids, input_mask, seq_lens
+        ]
+class MRCReader(BaseReader):
+    def __init__(self,
+                 vocab_path,
+                 label_map_config=None,
+                 max_seq_len=512,
+                 do_lower_case=True,
+                 in_tokens=False,
+                 random_seed=None,
+                 tokenizer="FullTokenizer",
+                 is_classify=True,
+                 is_regression=False,
+                 for_cn=True,
+                 task_id=0,
+                 doc_stride=128,
+                 max_query_length=64):
+        """Set up the tokenizer, special-token ids and record types.
+
+        Args:
+            vocab_path: vocabulary file given to `tokenization.FullTokenizer`.
+            max_seq_len: stored as `self.max_seq_len`.
+            do_lower_case: forwarded to the tokenizer.
+            in_tokens: whether batch sizes are counted in tokens.
+            random_seed: if not None, seeds `np.random`.
+            for_cn: stored as `self.for_cn`.
+            task_id: value used to fill the task-id array of every batch.
+            doc_stride: step between consecutive document windows.
+            max_query_length: questions are truncated to this many tokens.
+
+        `label_map_config`, `tokenizer`, `is_classify` and `is_regression`
+        are accepted but not used by this constructor.
+        """
+        self.max_seq_len = max_seq_len
+        self.tokenizer = tokenization.FullTokenizer(
+            vocab_file=vocab_path, do_lower_case=do_lower_case)
+        self.vocab = self.tokenizer.vocab
+        self.pad_id = self.vocab["[PAD]"]
+        self.cls_id = self.vocab["[CLS]"]
+        self.sep_id = self.vocab["[SEP]"]
+        self.in_tokens = in_tokens
+        self.for_cn = for_cn
+        self.task_id = task_id
+        self.doc_stride = doc_stride
+        self.max_query_length = max_query_length
+        # Per-phase caches filled by `data_generator`.
+        self.examples = {}
+        self.features = {}
+        if random_seed is not None:
+            # Seeds the global NumPy generator used for shuffling.
+            np.random.seed(random_seed)
+        self.current_example = 0
+        self.current_epoch = 0
+        self.num_examples = 0
+        # One question/paragraph pair as read from the JSON file.
+        self.Example = namedtuple('Example',
+                ['qas_id', 'question_text', 'doc_tokens', 'orig_answer_text',
+                'start_position', 'end_position'])
+        # One model input window derived from an Example.
+        self.Feature = namedtuple("Feature", ["unique_id", "example_index", "doc_span_index",
+                "tokens", "token_to_orig_map", "token_is_max_context",
+                "token_ids", "position_ids", "text_type_ids",
+                "start_position", "end_position"])
+        # A window over the tokenized document: first token index and length.
+        self.DocSpan = namedtuple("DocSpan", ["start", "length"])
+    def _read_json(self, input_file, is_training):
+        """Read a JSON file with a top-level "data" list into Examples.
+
+        Every entry holds "paragraphs"; every paragraph holds a "context"
+        string and a list of "qas" with "id", "question" and, for training,
+        exactly one answer with "text" and "answer_start".
+
+        Args:
+            input_file: path of the JSON file (read as UTF-8).
+            is_training: whether answers are required and parsed.
+
+        Returns:
+            A list of `self.Example` tuples.
+
+        Raises:
+            ValueError: if a training question does not have exactly one
+                answer.
+        """
+        examples = []
+        with open(input_file, "r", encoding='utf8') as f:
+            input_data = json.load(f)["data"]
+            for entry in input_data:
+                for paragraph in entry["paragraphs"]:
+                    paragraph_text = paragraph["context"]
+                    for qa in paragraph["qas"]:
+                        qas_id = qa["id"]
+                        question_text = qa["question"]
+                        start_pos = None
+                        end_pos = None
+                        orig_answer_text = None
+                        if is_training:
+                            if len(qa["answers"]) != 1:
+                                raise ValueError(
+                                    "For training, each question should have exactly 1 answer."
+                                )
+                            answer = qa["answers"][0]
+                            orig_answer_text = answer["text"]
+                            answer_offset = answer["answer_start"]
+                            answer_length = len(orig_answer_text)
+                            # The paragraph is cut into three chunks: text
+                            # before the answer, the answer span, text after.
+                            doc_tokens = [
+                                paragraph_text[:answer_offset],
+                                paragraph_text[answer_offset:answer_offset +
+                                               answer_length],
+                                paragraph_text[answer_offset + answer_length:]
+                            ]
+                            # The answer is therefore always chunk index 1.
+                            start_pos = 1
+                            end_pos = 1
+                            actual_text = " ".join(doc_tokens[start_pos:(end_pos
+                                                                         + 1)])
+                            # Skip the question when the text found at the
+                            # given offset does not contain the answer.
+                            if actual_text.find(orig_answer_text) == -1:
+                                log.info("Could not find answer: '%s' vs. '%s'",
+                                      actual_text, orig_answer_text)
+                                continue
+                        else:
+                            doc_tokens = tokenization.tokenize_chinese_chars(
+                                paragraph_text)
+                        example = self.Example(
+                            qas_id=qas_id,
+                            question_text=question_text,
+                            doc_tokens=doc_tokens,
+                            orig_answer_text=orig_answer_text,
+                            start_position=start_pos,
+                            end_position=end_pos)
+                        examples.append(example)
+        return examples
+    def _improve_answer_span(self, doc_tokens, input_start, input_end,
+                             tokenizer, orig_answer_text):
+        """Shrink a token span so that it matches the tokenized answer exactly.
+
+        Candidate spans inside `[input_start, input_end]` are tried from the
+        leftmost start and, for each start, from the rightmost end; the first
+        one whose space-joined tokens equal the space-joined tokenization of
+        `orig_answer_text` is returned. Without a match the input span is
+        returned unchanged.
+        """
+        target = " ".join(tokenizer.tokenize(orig_answer_text))
+        for begin in range(input_start, input_end + 1):
+            end = input_end
+            while end >= begin:
+                candidate = " ".join(doc_tokens[begin:(end + 1)])
+                if candidate == target:
+                    return (begin, end)
+                end -= 1
+        return (input_start, input_end)
+    def _check_is_max_context(self, doc_spans, cur_span_index, position):
+        """Tell whether `cur_span_index` is the best window for `position`.
+
+        Every span that contains `position` is scored with
+        `min(left context, right context) + 0.01 * span length`; the span
+        with the highest score wins (the earliest one on ties).
+        """
+        winner_index = None
+        winner_score = None
+        for index, span in enumerate(doc_spans):
+            last = span.start + span.length - 1
+            if not (span.start <= position <= last):
+                continue
+            left = position - span.start
+            right = last - position
+            candidate = min(left, right) + 0.01 * span.length
+            if winner_score is None or candidate > winner_score:
+                winner_index, winner_score = index, candidate
+        return cur_span_index == winner_index
+    def _convert_example_to_feature(self, examples, max_seq_length, tokenizer,
+                                    is_training):
+        """Turn examples into `self.Feature` windows of bounded length.
+
+        Each example's document is tokenized and split into overlapping
+        windows; every window becomes one feature laid out as
+        [CLS] question [SEP] window [SEP].
+
+        Args:
+            examples: sequence of `self.Example`.
+            max_seq_length: maximum number of tokens per feature.
+            tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
+            is_training: whether answer positions are computed.
+
+        Returns:
+            A list of `self.Feature`; `unique_id` starts at 1000000000 and
+            grows by one per feature.
+        """
+        features = []
+        unique_id = 1000000000
+        for (example_index, example) in enumerate(examples):
+            query_tokens = tokenizer.tokenize(example.question_text)
+            if len(query_tokens) > self.max_query_length:
+                query_tokens = query_tokens[0:self.max_query_length]
+            # tok_to_orig_index[j]: index of the document token that produced
+            # sub-token j. orig_to_tok_index[i]: index of the first sub-token
+            # of document token i.
+            tok_to_orig_index = []
+            orig_to_tok_index = []
+            all_doc_tokens = []
+            for (i, token) in enumerate(example.doc_tokens):
+                orig_to_tok_index.append(len(all_doc_tokens))
+                sub_tokens = tokenizer.tokenize(token)
+                for sub_token in sub_tokens:
+                    tok_to_orig_index.append(i)
+                    all_doc_tokens.append(sub_token)
+            tok_start_position = None
+            tok_end_position = None
+            if is_training:
+                # Map the answer from document-token to sub-token indices.
+                tok_start_position = orig_to_tok_index[example.start_position]
+                if example.end_position < len(example.doc_tokens) - 1:
+                    # Last sub-token before the next document token begins.
+                    tok_end_position = orig_to_tok_index[example.end_position +
+                                                         1] - 1
+                else:
+                    tok_end_position = len(all_doc_tokens) - 1
+                (tok_start_position,
+                 tok_end_position) = self._improve_answer_span(
+                     all_doc_tokens, tok_start_position, tok_end_position,
+                     tokenizer, example.orig_answer_text)
+            # Three positions are reserved for [CLS] and the two [SEP].
+            max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
+            # NOTE(review): if `max_tokens_for_doc` is not positive the loop
+            # below never advances `start_offset` -- confirm callers always
+            # pass a large enough `max_seq_length`.
+            doc_spans = []
+            start_offset = 0
+            while start_offset < len(all_doc_tokens):
+                length = len(all_doc_tokens) - start_offset
+                if length > max_tokens_for_doc:
+                    length = max_tokens_for_doc
+                doc_spans.append(self.DocSpan(start=start_offset, length=length))
+                if start_offset + length == len(all_doc_tokens):
+                    break
+                start_offset += min(length, self.doc_stride)
+            for (doc_span_index, doc_span) in enumerate(doc_spans):
+                tokens = []
+                token_to_orig_map = {}
+                token_is_max_context = {}
+                text_type_ids = []
+                # Question segment uses type id 0.
+                tokens.append("[CLS]")
+                text_type_ids.append(0)
+                for token in query_tokens:
+                    tokens.append(token)
+                    text_type_ids.append(0)
+                tokens.append("[SEP]")
+                text_type_ids.append(0)
+                # Document segment uses type id 1; the two maps are keyed by
+                # the token's position inside the feature.
+                for i in range(doc_span.length):
+                    split_token_index = doc_span.start + i
+                    token_to_orig_map[len(tokens)] = tok_to_orig_index[
+                        split_token_index]
+                    is_max_context = self._check_is_max_context(
+                        doc_spans, doc_span_index, split_token_index)
+                    token_is_max_context[len(tokens)] = is_max_context
+                    tokens.append(all_doc_tokens[split_token_index])
+                    text_type_ids.append(1)
+                tokens.append("[SEP]")
+                text_type_ids.append(1)
+                token_ids = tokenizer.convert_tokens_to_ids(tokens)
+                position_ids = list(range(len(token_ids)))
+                start_position = None
+                end_position = None
+                if is_training:
+                    doc_start = doc_span.start
+                    doc_end = doc_span.start + doc_span.length - 1
+                    out_of_span = False
+                    if not (tok_start_position >= doc_start and
+                            tok_end_position <= doc_end):
+                        out_of_span = True
+                    if out_of_span:
+                        # Answer not fully inside this window: point both
+                        # ends at position 0, the [CLS] token.
+                        start_position = 0
+                        end_position = 0
+                    else:
+                        # Skip [CLS], the question tokens and the first [SEP].
+                        doc_offset = len(query_tokens) + 2
+                        start_position = tok_start_position - doc_start + doc_offset
+                        end_position = tok_end_position - doc_start + doc_offset
+                feature = self.Feature(
+                    unique_id=unique_id,
+                    example_index=example_index,
+                    doc_span_index=doc_span_index,
+                    tokens=tokens,
+                    token_to_orig_map=token_to_orig_map,
+                    token_is_max_context=token_is_max_context,
+                    token_ids=token_ids,
+                    position_ids=position_ids,
+                    text_type_ids=text_type_ids,
+                    start_position=start_position,
+                    end_position=end_position)
+                features.append(feature)
+                unique_id += 1
+        return features
+    def _prepare_batch_data(self, records, batch_size, phase=None):
+        """Yield padded batches built from `records`.
+
+        With `self.in_tokens` the limit applies to
+        `(records in batch) * (longest record)`, otherwise to the number of
+        records. The trailing partial batch is only emitted when `phase` is
+        'pred'. During 'train', `self.current_example` follows the index of
+        the record being processed.
+        """
+        pending, longest = [], 0
+        for position, item in enumerate(records):
+            if phase == "train":
+                self.current_example = position
+            longest = max(longest, len(item.token_ids))
+            fits = ((len(pending) + 1) * longest <= batch_size
+                    if self.in_tokens else len(pending) < batch_size)
+            if fits:
+                pending.append(item)
+                continue
+            yield self._pad_batch_records(pending, phase == "train")
+            # The record that did not fit opens the next batch.
+            pending, longest = [item], len(item.token_ids)
+        if phase == 'pred' and pending:
+            yield self._pad_batch_records(pending, phase == "train")
+    def _pad_batch_records(self, batch_records, is_training):
+        """Pad a list of features into the arrays fed to the model.
+
+        Returns a list of padded token ids, text-type ids, position ids, task
+        ids, input mask, start positions, end positions and unique ids. When
+        `is_training` is false the positions are all-zero int64 columns.
+        """
+        token_seqs = [rec.token_ids for rec in batch_records]
+        type_seqs = [rec.text_type_ids for rec in batch_records]
+        pos_seqs = [rec.position_ids for rec in batch_records]
+        if is_training:
+            starts = [rec.start_position for rec in batch_records]
+            ends = [rec.end_position for rec in batch_records]
+            starts = np.array(starts).astype("int64").reshape([-1, 1])
+            ends = np.array(ends).astype("int64").reshape([-1, 1])
+        else:
+            count = len(token_seqs)
+            starts = np.zeros(shape=[count, 1], dtype="int64")
+            ends = np.zeros(shape=[count, 1], dtype="int64")
+        unique_ids = np.array(
+            [rec.unique_id for rec in batch_records]).astype("int64").reshape(
+                [-1, 1])
+        # padding
+        padded_token_ids, input_mask = pad_batch_data(
+            token_seqs, pad_idx=self.pad_id, return_input_mask=True)
+        padded_text_type_ids = pad_batch_data(type_seqs, pad_idx=self.pad_id)
+        padded_position_ids = pad_batch_data(pos_seqs, pad_idx=self.pad_id)
+        padded_task_ids = np.ones_like(
+            padded_token_ids, dtype="int64") * self.task_id
+        return [
+            padded_token_ids, padded_text_type_ids, padded_position_ids,
+            padded_task_ids, input_mask, starts, ends, unique_ids
+        ]
+    def get_num_examples(self, phase):
+        """Return how many features are cached for `phase`."""
+        cached = self.features[phase]
+        return len(cached)
+    def get_features(self, phase):
+        """Return the features cached for `phase` (KeyError if absent)."""
+        cached = self.features[phase]
+        return cached
+    def get_examples(self, phase):
+        """Return the examples cached for `phase` (KeyError if absent)."""
+        cached = self.examples[phase]
+        return cached
+    def data_generator(self,
+                       input_file,
+                       batch_size,
+                       epoch,
+                       dev_count=1,
+                       shuffle=True,
+                       phase=None):
+        """Build a generator function that yields padded batches.
+
+        Args:
+            input_file: JSON file to read; only used when no examples are
+                cached for `phase` yet.
+            batch_size: batch limit passed to `_prepare_batch_data`.
+            epoch: number of passes over the data; `None` means 99999999.
+            dev_count: number of batches buffered before they are yielded
+                together.
+            shuffle: whether features are shuffled in place each epoch; only
+                honoured when `phase` is "train".
+            phase: cache key; "train" also enables answer parsing and
+                progress tracking.
+
+        Returns:
+            A zero-argument generator function (`wrapper`).
+        """
+        examples = self.examples.get(phase, None)
+        features = self.features.get(phase, None)
+        if not examples:
+            # First call for this phase: parse the file and cache the result.
+            examples = self._read_json(input_file, phase == "train")
+            features = self._convert_example_to_feature(
+                examples, self.max_seq_len, self.tokenizer, phase == "train")
+            self.examples[phase] = examples
+            self.features[phase] = features
+        def wrapper():
+            # Buffer shared by all epochs; it is not reset between them.
+            all_dev_batches = []
+            if epoch is None:
+                num_epochs = 99999999
+            else:
+                num_epochs = epoch
+            for epoch_index in range(num_epochs):
+                if phase == "train":
+                    self.current_example = 0
+                    self.current_epoch = epoch_index
+                if phase == "train" and shuffle:
+                    np.random.shuffle(features)
+                for batch_data in self._prepare_batch_data(
+                        features, batch_size, phase=phase):
+                    # Emit batches in groups of exactly `dev_count`; a final
+                    # incomplete group is never yielded.
+                    if len(all_dev_batches) < dev_count:
+                        all_dev_batches.append(batch_data)
+                    if len(all_dev_batches) == dev_count:
+                        for batch in all_dev_batches:
+                            yield batch
+                        all_dev_batches = []
+        return wrapper
+# Running this module as a script does nothing.
+if __name__ == '__main__':
+    pass
--- a/paddlepalm/task_instance.py
+++ b/paddlepalm/task_instance.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddlepalm.interface import reader as base_reader
+from paddlepalm.interface import task_paradigm as base_paradigm
+import os
+import json
+from paddle import fluid
+class TaskInstance(object):
+    def __init__(self, name, id, config=None, verbose=True):
+        """Create the bookkeeping state for one task.
+
+        Args:
+            name: task name, used in log messages and as the default
+                `task_reuse_scope`.
+            id: accepted for interface compatibility; not used here.
+            config: task configuration mapping. Must contain 'save_path';
+                'is_target', 'is_first_target' and 'task_reuse_scope' are
+                optional.
+            verbose: whether the property setters print status messages.
+
+        Raises:
+            KeyError: if `config` has no 'save_path' entry.
+        """
+        # Avoid sharing one mutable default dict between instances.
+        if config is None:
+            config = {}
+        self._name = name
+        self._config = config
+        self._verbose = verbose
+        self._save_infermodel_path = os.path.join(self._config['save_path'], 'infer_model')
+        self._save_ckpt_path = os.path.join(self._config['save_path'], 'ckpt')
+        # following flags can be fetch from instance config file
+        self._is_target = config.get('is_target', True)
+        self._first_target = config.get('is_first_target', False)
+        # The `is_first_target` property reads `_is_first_target`; initialise
+        # it so the getter works before the setter has been called.
+        self._is_first_target = self._first_target
+        self._task_reuse_scope = config.get('task_reuse_scope', name)
+        self._feeded_var_names = None
+        self._target_vars = None
+        # training process management
+        self._mix_ratio = None
+        self._expected_train_steps = None
+        self._expected_train_epochs = None
+        self._steps_pur_epoch = None
+        self._cur_train_epoch = 0
+        self._cur_train_step = 0
+        self._train_finish = False
+        # Dataset readers for each run phase (train, eval, pred): the key is
+        # the phase, the value is a Reader instance.
+        self._reader = {'train': None, 'eval': None, 'pred': None}
+        self._input_layer = None
+        self._inputname_to_varname = {}
+        self._task_layer = {'train': None, 'eval': None, 'pred': None}
+        self._pred_input_name_list = []
+        self._pred_input_varname_list = []
+        self._pred_fetch_name_list = []
+        self._pred_fetch_var_list = []
+        self._Reader = None
+        self._Paradigm = None
+        self._exe = fluid.Executor(fluid.CPUPlace())
+        # Maps each key of the saved `__conf__` file to the attribute (written
+        # as 'self.<attr>') that holds its value.
+        self._save_protocol = {
+            'input_names': 'self._pred_input_name_list',
+            'input_varnames': 'self._pred_input_varname_list',
+            'fetch_list': 'self._pred_fetch_name_list'}
+    def build_task_layer(self, net_inputs, phase):
+        """Build the task layer of `phase` and return its output variables.
+
+        For the 'pred' phase the output names and variables are also stored
+        as the prediction fetch lists.
+        """
+        layer = self._task_layer[phase]
+        outputs = layer.build(net_inputs)
+        if phase != 'pred':
+            return outputs
+        names, variables = zip(*outputs.items())
+        self._pred_fetch_name_list, self._pred_fetch_var_list = names, variables
+        return outputs
+    def postprocess(self, rt_outputs, phase):
+        """Delegate per-step post-processing to the task layer of `phase`."""
+        layer = self._task_layer[phase]
+        return layer.postprocess(rt_outputs)
+    def epoch_postprocess(self, epoch_inputs, phase):
+        """Delegate per-epoch post-processing to the task layer of `phase`."""
+        layer = self._task_layer[phase]
+        return layer.epoch_postprocess(epoch_inputs)
+    def save(self, suffix=''):
+        """Save the inference model and its `__conf__` description.
+
+        Args:
+            suffix: text appended to the inference-model directory path.
+
+        The `__conf__` file is JSON holding, for every key of
+        `self._save_protocol`, the current value of the attribute that key
+        refers to.
+        """
+        dirpath = self._save_infermodel_path + suffix
+        self._pred_input_varname_list = [str(i) for i in self._pred_input_varname_list]
+        fluid.io.save_inference_model(dirpath, self._pred_input_varname_list, self._pred_fetch_var_list, self._exe)
+        print(self._name + ': inference model saved at ' + dirpath)
+        conf = {}
+        for k, strv in self._save_protocol.items():
+            # Protocol values look like 'self.<attr>'. Resolve them with
+            # getattr: `exec('v=...')` cannot create a function-local name in
+            # Python 3, so the previous code raised NameError.
+            attr = strv.split('.', 1)[1] if strv.startswith('self.') else strv
+            conf[k] = getattr(self, attr)
+        with open(os.path.join(dirpath, '__conf__'), 'w') as writer:
+            writer.write(json.dumps(conf, indent=1))
+    def load(self, infer_model_path=None):
+        """Load a saved inference model and restore the `__conf__` values.
+
+        Args:
+            infer_model_path: directory to load from; defaults to the path
+                used by `save` without suffix.
+
+        Returns:
+            The program returned by `fluid.io.load_inference_model`.
+
+        Raises:
+            KeyError: if `__conf__` contains a key that is not part of
+                `self._save_protocol`.
+        """
+        if infer_model_path is None:
+            infer_model_path = self._save_infermodel_path
+        # Close the file deterministically instead of leaking the handle.
+        with open(os.path.join(infer_model_path, '__conf__')) as conf_file:
+            conf = json.load(conf_file)
+        for k, v in conf.items():
+            strv = self._save_protocol[k]
+            # Protocol values look like 'self.<attr>'; assign with setattr
+            # rather than executing generated code on data read from disk.
+            attr = strv.split('.', 1)[1] if strv.startswith('self.') else strv
+            setattr(self, attr, v)
+        pred_prog, self._pred_input_varname_list, self._pred_fetch_var_list = \
+            fluid.io.load_inference_model(infer_model_path, self._exe)
+        print(self._name+': inference model loaded from ' + infer_model_path)
+        return pred_prog
+    @property
+    def name(self):
+        """Return the task name given to the constructor."""
+        return self._name
+    @property
+    def Reader(self):
+        """Return the reader class registered for this task (None until set)."""
+        return self._Reader
+    @Reader.setter
+    def Reader(self, cls):
+        """Register the reader class after checking its base class.
+
+        Only the *name* of the last entry in `cls.__bases__` is compared with
+        the name of the reader interface.
+        """
+        assert base_reader.__name__ == cls.__bases__[-1].__name__, \
+            "expect: {}, receive: {}.".format(base_reader.__name__, \
+                                              cls.__bases__[-1].__name__)
+        self._Reader = cls
+    @property
+    def Paradigm(self):
+        """Return the paradigm class registered for this task (None until set)."""
+        return self._Paradigm
+    @Paradigm.setter
+    def Paradigm(self, cls):
+        """Register the paradigm class after checking its base class.
+
+        Only the *name* of the last entry in `cls.__bases__` is compared with
+        the name of the task-paradigm interface.
+        """
+        assert base_paradigm.__name__ == cls.__bases__[-1].__name__, \
+            "expect: {}, receive: {}.".format(base_paradigm.__name__, \
+                                              cls.__bases__[-1].__name__)
+        self._Paradigm = cls
+    @property
+    def config(self):
+        """Return the configuration mapping given to the constructor."""
+        return self._config
+    @property
+    def reader(self):
+        """Return the phase -> reader dict ('train', 'eval', 'pred')."""
+        return self._reader
+    @property
+    def pred_input(self):
+        """Return a zip of (input name, variable name) pairs for prediction."""
+        return zip(*[self._pred_input_name_list, self._pred_input_varname_list])
+    @pred_input.setter
+    def pred_input(self, val):
+        """Store prediction inputs from a dict of input name -> variable.
+
+        Each value must expose a `.name` attribute; the names and variable
+        names are stored as two parallel tuples. An empty dict cannot be
+        unpacked and raises ValueError.
+        """
+        assert isinstance(val, dict)
+        self._pred_input_name_list, self._pred_input_varname_list = \
+            zip(*[[k, v.name] for k,v in val.items()])
+        # print(self._pred_input_name_list)
+    @property
+    def pred_fetch_list(self):
+        """Return `[fetch names, fetch variables]` for the prediction phase."""
+        return [self._pred_fetch_name_list, self._pred_fetch_var_list]
+    @property
+    def task_layer(self):
+        """Return the phase -> task layer dict ('train', 'eval', 'pred')."""
+        return self._task_layer
+    @property
+    def is_first_target(self):
+        """Return whether this task is flagged as the main (first target) task."""
+        return self._is_first_target
+    @is_first_target.setter
+    def is_first_target(self, value):
+        """Flag or unflag this task as the main task.
+
+        Only a target task may be flagged; this is enforced with an assert.
+        """
+        flag = bool(value)
+        self._is_first_target = flag
+        if not flag:
+            return
+        assert self._is_target, "ERROR: only target task could be set as main task."
+        if self._verbose:
+            print("{}: set as main task".format(self._name))
+    @property
+    def is_target(self):
+        """Return whether this is a target task; ValueError if unset (None)."""
+        if self._is_target is None:
+            raise ValueError("{}: is_target is None".format(self._name))
+        return self._is_target
+    @is_target.setter
+    def is_target(self, value):
+        """Mark this task as a target task (truthy) or an auxiliary task."""
+        self._is_target = bool(value)
+        if not self._verbose:
+            return
+        if self._is_target:
+            print('{}: set as target task.'.format(self._name))
+        else:
+            print('{}: set as aux task.'.format(self._name))
+    @property
+    def mix_ratio(self):
+        if self._mix_ratio is not None:
+            return self._mix_ratio
+        else:
+            raise ValueError("{}: mix_ratio is None".format(self._name))
+    @mix_ratio.setter
+    def mix_ratio(self, value):
+        self._mix_ratio = float(value)
+        if self._verbose:
+            print('{}: mix_ratio is set to {}'.format(self._name, self._mix_ratio))
+    @property
+    def expected_train_steps(self):
+        return self._expected_train_steps
+    @expected_train_steps.setter
+    def expected_train_steps(self, value):
+        self._expected_train_steps = value
+        self._expected_train_epochs = value / float(self._steps_pur_epoch)
+    @property
+    def expected_train_epochs(self):
+        """Return the expected epoch count derived by the `expected_train_steps` setter."""
+        return self._expected_train_epochs
+    @property
+    def cur_train_epoch(self):
+        """Return the current training epoch counter."""
+        return self._cur_train_epoch
+    @cur_train_epoch.setter
+    def cur_train_epoch(self, value):
+        """Overwrite the current training epoch counter with `value` (no validation)."""
+        self._cur_train_epoch = value
+    @property
+    def cur_train_step(self):
+        return self._cur_train_step
+    @cur_train_step.setter
+    def cur_train_step(self, value):
+        self._cur_train_step = value
+        if self._cur_train_step > self._steps_pur_epoch:
+            self._cur_train_epoch += 1
+            self._cur_train_step = 1
+        if self._is_target and self._cur_train_step + self._cur_train_epoch * self._steps_pur_epoch >= self._expected_train_steps:
+            self._train_finish = True
+            print(self._name+': train finished!')
+            self.save()
+            # fluid.io.save_inference_model(self._save_infermodel_path, )
+    @property
+    def steps_pur_epoch(self):
+        """Return the stored number of steps per epoch."""
+        return self._steps_pur_epoch
+    @steps_pur_epoch.setter
+    def steps_pur_epoch(self, value):
+        """Overwrite the number of steps per epoch with `value` (no validation)."""
+        self._steps_pur_epoch = value
+    @property
+    def train_finish(self):
+        """Return the `_train_finish` flag (set to True by the `cur_train_step` setter)."""
+        return self._train_finish
+    @property
+    def task_reuse_scope(self):
+        if self._task_reuse_scope is not None:
+            return self._task_reuse_scope
+        else:
+            raise ValueError("{}: task_reuse_scope is None".format(self._name))
+    @task_reuse_scope.setter
+    def task_reuse_scope(self, scope_name):
+        self._task_reuse_scope = str(scope_name)
+        if self._verbose:
+            print('{}: task_reuse_scope is set to {}'.format(self._name, self._task_reuse_scope))
+def check_instances(insts):
+    """to check ids, first_target"""
+    pass
+def _check_ids():
+    pass
+def _check_targets():
+    pass
+def _check_reuse_scopes():
+    pass
--- a/paddlepalm/task_paradigm/__init__.py
+++ b/paddlepalm/task_paradigm/__init__.py
--- a/paddlepalm/task_paradigm/cls.py
+++ b/paddlepalm/task_paradigm/cls.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle.fluid as fluid
+from paddlepalm.interface import task_paradigm
+from paddle.fluid import layers
+class TaskParadigm(task_paradigm):
+    '''
+    classification
+    '''
+    def __init___(self, config, phase):
+        self._is_training = phase == 'train'
+        self.sent_emb_size = config['hidden_size']
+        self.num_classes = config['n_classes']
+    @property
+    def inputs_attrs(self):
+        return {'bakcbone': {"sentence_emb": [-1, self.sent_emb_size], 'float32']},
+                'reader': {"label_ids": [[-1, 1], 'int64']}}
+    @property
+    def outputs_attrs(self):
+        if self._is_training:
+            return {'loss': [[1], 'float32']}
+        else:
+            return {'logits': [-1, self.num_classes], 'float32'}
+    def build(self, **inputs):
+        sent_emb = inputs['backbone']['sentence_emb']
+        label_ids = inputs['reader']['label_ids']
+        logits = fluid.layers.fc(
+            input=ent_emb
+            size=self.num_classes,
+            param_attr=fluid.ParamAttr(
+                name="cls_out_w",
+                initializer=fluid.initializer.TruncatedNormal(scale=0.1)),
+            bias_attr=fluid.ParamAttr(
+                name="cls_out_b", initializer=fluid.initializer.Constant(0.)))
+        loss = fluid.layers.softmax_with_cross_entropy(
+            logits=logits, label=label_ids)
+        loss = layers.mean(loss)
+        if self._is_training:
+            return {"loss": loss}
+        else:
+            return {"logits":logits}
--- a/paddlepalm/task_paradigm/match.py
+++ b/paddlepalm/task_paradigm/match.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle.fluid as fluid
+from paddlepalm.interface import task_paradigm
+from paddle.fluid import layers
+class TaskParadigm(task_paradigm):
+    '''
+    matching
+    '''
+    def __init__(self, config, phase, backbone_config=None):
+        self._is_training = phase == 'train'
+        self._hidden_size = backbone_config['hidden_size']
+    @property
+    def inputs_attrs(self):
+        if self._is_training:
+            reader = {"label_ids": [[-1, 1], 'int64']}
+        else:
+            reader = {}
+        bb = {"sentence_pair_embedding": [[-1, self._hidden_size], 'float32']}
+        return {'reader': reader, 'backbone': bb}
+    @property
+    def outputs_attrs(self):
+        if self._is_training:
+            return {"loss": [[1], 'float32']}
+        else:
+            return {"logits": [[-1, 1], 'float32']}
+    def build(self, inputs):
+        labels = inputs["reader"]["label_ids"] 
+        cls_feats = inputs["backbone"]["sentence_pair_embedding"]
+        cls_feats = fluid.layers.dropout(
+            x=cls_feats,
+            dropout_prob=0.1,
+            dropout_implementation="upscale_in_train")
+        logits = fluid.layers.fc(
+            input=cls_feats,
+            size=2,
+            param_attr=fluid.ParamAttr(
+                name="cls_out_w",
+                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+            bias_attr=fluid.ParamAttr(
+                name="cls_out_b",
+                initializer=fluid.initializer.Constant(0.)))
+        ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
+            logits=logits, label=labels, return_softmax=True)
+        loss = fluid.layers.mean(x=ce_loss)
+        if self._is_training:
+            return {'loss': loss}
+        else:
+            return {'logits': logits}
--- a/paddlepalm/task_paradigm/mlm.py
+++ b/paddlepalm/task_paradigm/mlm.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle.fluid as fluid
+from paddlepalm.interface import task_paradigm
+from paddle.fluid import layers
+class TaskParadigm(task_paradigm):
+    '''
+    matching
+    '''
+    def __init__(self, config, phase, backbone_config=None):
+        self._is_training = phase == 'train'
+        self._hidden_size = backbone_config['hidden_size']
+        self._vocab_size = backbone_config['vocab_size']
+        self._hidden_act = backbone_config['hidden_act']
+        self._initializer_range = backbone_config['initializer_range']
+    @property
+    def inputs_attrs(self):
+        if self._is_training:
+            reader = {"label_ids": [[-1, 1], 'int64']}
+        else:
+            reader = {}
+        bb = {"encoder_outputs": [[-1, self._hidden_size], 'float32']}
+        return {'reader': reader, 'backbone': bb}
+    @property
+    def outputs_attrs(self):
+        if self._is_training:
+            return {"loss": [[1], 'float32']}
+        else:
+            return {"logits": [[-1, 1], 'float32']}
+    def build(self, inputs):
+        mask_label = inputs["reader"]["mask_label"] 
+        mask_pos = inputs["reader"]["mask_pos"] 
+        word_emb = inputs["backbone"]["word_embedding"]
+        enc_out = inputs["backbone"]["encoder_outputs"]
+        emb_size = word_emb.shape[-1]
+        _param_initializer = fluid.initializer.TruncatedNormal(
+            scale=self._initializer_range)
+        mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
+        reshaped_emb_out = fluid.layers.reshape(
+            x=enc_out, shape=[-1, emb_size])
+        # extract masked tokens' feature
+        mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
+        num_seqs = fluid.layers.fill_constant(shape=[1], value=512, dtype='int64')
+        # transform: fc
+        mask_trans_feat = fluid.layers.fc(
+            input=mask_feat,
+            size=emb_size,
+            act=self._hidden_act,
+            param_attr=fluid.ParamAttr(
+                name='mask_lm_trans_fc.w_0',
+                initializer=_param_initializer),
+            bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
+        # transform: layer norm
+        mask_trans_feat = pre_process_layer(
+            mask_trans_feat, 'n', name='mask_lm_trans')
+        mask_lm_out_bias_attr = fluid.ParamAttr(
+            name="mask_lm_out_fc.b_0",
+            initializer=fluid.initializer.Constant(value=0.0))
+        # print fluid.default_main_program().global_block()
+        # fc_out = fluid.layers.matmul(
+        #     x=mask_trans_feat,
+        #     y=fluid.default_main_program().global_block().var(
+        #         _word_emb_name),
+        #     transpose_y=True)
+        fc_out = fluid.layers.matmul(
+            x=mask_trans_feat,
+            y=word_emb,
+            transpose_y=True)
+        fc_out += fluid.layers.create_parameter(
+            shape=[self._vocab_size],
+            dtype='float32',
+            attr=mask_lm_out_bias_attr,
+            is_bias=True)
+        mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
+            logits=fc_out, label=mask_label)
+        loss = fluid.layers.mean(mask_lm_loss)
+        if self._is_training:
+            return {'loss': loss}
+        else:
+            return None
--- a/paddlepalm/task_paradigm/mrc.py
+++ b/paddlepalm/task_paradigm/mrc.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle.fluid as fluid
+from paddlepalm.interface import task_paradigm
+import collections
+import numpy as np
+import os
+import math
+import six
+import paddlepalm.tokenizer.ernie_tokenizer as tokenization
+import json
+RawResult = collections.namedtuple("RawResult",
+                                   ["unique_id", "start_logits", "end_logits"])
+class TaskParadigm(task_paradigm):
+    """"""
+    def __init__(self, config, phase, backbone_config=None):
+        self._is_training = phase == 'train'
+        self._max_sequence_length = config['max_seq_len']
+        self._hidden_size = backbone_config['hidden_size']
+        self._pred_results = []
+        if phase == 'pred':
+            self._max_answer_length = config.get('max_answer_len', None)
+            self._null_score_diff_threshold = config.get('null_score_diff_threshold', 0.0)
+            self._n_best_size = config.get('n_best_size', 20)
+            self._pred_output_path = config.get('pred_output_path', None)
+            self._verbose = config.get('verbose', False)
+            self._with_negative = config.get('with_negative', False)
+            self._do_lower_case = config.get('do_lower_case', False)
+    @property
+    def inputs_attrs(self):
+        if self._is_training:
+            reader = {"start_positions": [[-1, 1], 'int64'],
+                      "end_positions": [[-1, 1], 'int64']}
+        else:
+            reader = {'unique_ids': [[-1, 1], 'int64']}
+        bb = {"encoder_outputs": [[-1, -1, self._hidden_size], 'float32']}
+        return {'reader': reader, 'backbone': bb}
+    @property
+    def epoch_inputs_attrs(self):
+        if not self._is_training:
+            from_reader = {'examples': None, 'features': None}
+            return {'reader': from_reader}
+    @property
+    def outputs_attr(self):
+        if self._is_training:
+            return {'start_logits': [[-1, -1, 1], 'float32'],
+                    'end_logits': [[-1, -1, 1], 'float32'],
+                    'loss': [[1], 'float32']}
+        else:
+            return {'start_logits': [[-1, -1, 1], 'float32'],
+                    'end_logits': [[-1, -1, 1], 'float32'],
+                    'unique_ids': [[-1, 1], 'int64']}
+    def build(self, inputs):
+        if self._is_training:
+            start_positions = inputs['reader']['start_positions']
+            end_positions = inputs['reader']['end_positions']
+        else:
+            unique_id = inputs['reader']['unique_ids']
+        enc_out = inputs['backbone']['encoder_outputs']
+        logits = fluid.layers.fc(
+            input=enc_out,
+            size=2,
+            num_flatten_dims=2,
+            param_attr=fluid.ParamAttr(
+                name="cls_squad_out_w",
+                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
+            bias_attr=fluid.ParamAttr(
+                name="cls_squad_out_b", initializer=fluid.initializer.Constant(0.)))
+        logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
+        start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)
+        def _compute_single_loss(logits, positions):
+            """Compute start/end loss for mrc model"""
+            loss = fluid.layers.softmax_with_cross_entropy(
+                logits=logits, label=positions)
+            loss = fluid.layers.mean(x=loss)
+            return loss
+        if self._is_training:
+            start_loss = _compute_single_loss(start_logits, start_positions)
+            end_loss = _compute_single_loss(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2.0
+            return {'start_logits': start_logits,
+                    'end_logits': end_logits,
+                    'loss': total_loss}
+        else:
+            return {'start_logits': start_logits,
+                    'end_logits': end_logits,
+                    'unique_ids': unique_id}
+    def postprocess(self, rt_outputs):                                                                                                                                                   
+        """this func will be called after each step(batch) of training/evaluating/predicting process."""
+        if not self._is_training:
+            unique_ids = np.squeeze(rt_outputs['unique_ids'], -1)
+            start_logits = rt_outputs['start_logits']
+            end_logits = rt_outputs['end_logits']
+            for idx in range(len(unique_ids)):
+                if unique_ids[idx] < 0:
+                    continue
+                if len(self._pred_results) % 1000 == 0:
+                    print("Predicting example: {}".format(len(self._pred_results)))
+                uid = int(unique_ids[idx])
+                s = [float(x) for x in start_logits[idx].flat]
+                e = [float(x) for x in end_logits[idx].flat]
+                self._pred_results.append(
+                    RawResult(
+                        unique_id=uid,
+                        start_logits=s,
+                        end_logits=e))
+    def epoch_postprocess(self, post_inputs):
+        """(optional interface) this func will be called after evaluation/predicting process and each epoch during training process."""
+        if not self._is_training:
+            if self._pred_output_path is None:
+                raise ValueError('argument pred_output_path not found in config. Please add it into config dict/file.')
+            examples = post_inputs['reader']['examples']
+            features = post_inputs['reader']['features']
+            if not os.path.exists(self._pred_output_path):
+                os.makedirs(self._pred_output_path)
+            output_prediction_file = os.path.join(self._pred_output_path, "predictions.json")
+            output_nbest_file = os.path.join(self._pred_output_path, "nbest_predictions.json")
+            output_null_log_odds_file = os.path.join(self._pred_output_path, "null_odds.json")
+            _write_predictions(examples, features, self._pred_results,
+                              self._n_best_size, self._max_answer_length,
+                              self._do_lower_case, output_prediction_file,
+                              output_nbest_file, output_null_log_odds_file,
+                              self._with_negative,
+                              self._null_score_diff_threshold, self._verbose)
+def _write_predictions(all_examples, all_features, all_results, n_best_size,
+                      max_answer_length, do_lower_case, output_prediction_file,
+                      output_nbest_file, output_null_log_odds_file,
+                      with_negative, null_score_diff_threshold,
+                      verbose):
+    """Write final predictions to the json file and log-odds of null if needed."""
+    print("Writing predictions to: %s" % (output_prediction_file))
+    print("Writing nbest to: %s" % (output_nbest_file))
+    example_index_to_features = collections.defaultdict(list)
+    for feature in all_features:
+        example_index_to_features[feature.example_index].append(feature)
+    unique_id_to_result = {}
+    for result in all_results:
+        unique_id_to_result[result.unique_id] = result
+    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "PrelimPrediction", [
+            "feature_index", "start_index", "end_index", "start_logit",
+            "end_logit"
+        ])
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    scores_diff_json = collections.OrderedDict()
+    for (example_index, example) in enumerate(all_examples):
+        features = example_index_to_features[example_index]
+        prelim_predictions = []
+        # keep track of the minimum score of null start+end of position 0
+        score_null = 1000000  # large and positive
+        min_null_feature_index = 0  # the paragraph slice with min mull score
+        null_start_logit = 0  # the start logit at the slice with min null score
+        null_end_logit = 0  # the end logit at the slice with min null score
+        for (feature_index, feature) in enumerate(features):
+            result = unique_id_to_result[feature.unique_id]
+            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
+            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
+            # if we could have irrelevant answers, get the min score of irrelevant
+            if with_negative:
+                feature_null_score = result.start_logits[0] + result.end_logits[
+                    0]
+                if feature_null_score < score_null:
+                    score_null = feature_null_score
+                    min_null_feature_index = feature_index
+                    null_start_logit = result.start_logits[0]
+                    null_end_logit = result.end_logits[0]
+            for start_index in start_indexes:
+                for end_index in end_indexes:
+                    # We could hypothetically create invalid predictions, e.g., predict
+                    # that the start of the span is in the question. We throw out all
+                    # invalid predictions.
+                    if start_index >= len(feature.tokens):
+                        continue
+                    if end_index >= len(feature.tokens):
+                        continue
+                    if start_index not in feature.token_to_orig_map:
+                        continue
+                    if end_index not in feature.token_to_orig_map:
+                        continue
+                    if not feature.token_is_max_context.get(start_index, False):
+                        continue
+                    if end_index < start_index:
+                        continue
+                    length = end_index - start_index + 1
+                    if length > max_answer_length:
+                        continue
+                    prelim_predictions.append(
+                        _PrelimPrediction(
+                            feature_index=feature_index,
+                            start_index=start_index,
+                            end_index=end_index,
+                            start_logit=result.start_logits[start_index],
+                            end_logit=result.end_logits[end_index]))
+        if with_negative:
+            prelim_predictions.append(
+                _PrelimPrediction(
+                    feature_index=min_null_feature_index,
+                    start_index=0,
+                    end_index=0,
+                    start_logit=null_start_logit,
+                    end_logit=null_end_logit))
+        prelim_predictions = sorted(
+            prelim_predictions,
+            key=lambda x: (x.start_logit + x.end_logit),
+            reverse=True)
+        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+            "NbestPrediction", ["text", "start_logit", "end_logit"])
+        seen_predictions = {}
+        nbest = []
+        for pred in prelim_predictions:
+            if len(nbest) >= n_best_size:
+                break
+            feature = features[pred.feature_index]
+            if pred.start_index > 0:  # this is a non-null prediction
+                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1
+                                                              )]
+                orig_doc_start = feature.token_to_orig_map[pred.start_index]
+                orig_doc_end = feature.token_to_orig_map[pred.end_index]
+                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
+                                                                 1)]
+                tok_text = " ".join(tok_tokens)
+                # De-tokenize WordPieces that have been split off.
+                tok_text = tok_text.replace(" ##", "")
+                tok_text = tok_text.replace("##", "")
+                # Clean whitespace
+                tok_text = tok_text.strip()
+                tok_text = " ".join(tok_text.split())
+                orig_text = " ".join(orig_tokens)
+                final_text = _get_final_text(tok_text, orig_text, do_lower_case,
+                                            verbose)
+                if final_text in seen_predictions:
+                    continue
+                seen_predictions[final_text] = True
+            else:
+                final_text = ""
+                seen_predictions[final_text] = True
+            nbest.append(
+                _NbestPrediction(
+                    text=final_text,
+                    start_logit=pred.start_logit,
+                    end_logit=pred.end_logit))
+        # if we didn't inlude the empty option in the n-best, inlcude it
+        if with_negative:
+            if "" not in seen_predictions:
+                nbest.append(
+                    _NbestPrediction(
+                        text="",
+                        start_logit=null_start_logit,
+                        end_logit=null_end_logit))
+        # In very rare edge cases we could have no valid predictions. So we
+        # just create a nonce prediction in this case to avoid failure.
+        if not nbest:
+            nbest.append(
+                _NbestPrediction(
+                    text="empty", start_logit=0.0, end_logit=0.0))
+        assert len(nbest) >= 1
+        total_scores = []
+        best_non_null_entry = None
+        for entry in nbest:
+            total_scores.append(entry.start_logit + entry.end_logit)
+            if not best_non_null_entry:
+                if entry.text:
+                    best_non_null_entry = entry
+        # debug
+        if best_non_null_entry is None:
+            print("Emmm..., sth wrong")
+        probs = _compute_softmax(total_scores)
+        nbest_json = []
+        for (i, entry) in enumerate(nbest):
+            output = collections.OrderedDict()
+            output["text"] = entry.text
+            output["probability"] = probs[i]
+            output["start_logit"] = entry.start_logit
+            output["end_logit"] = entry.end_logit
+            nbest_json.append(output)
+        assert len(nbest_json) >= 1
+        if not with_negative:
+            all_predictions[example.qas_id] = nbest_json[0]["text"]
+        else:
+            # predict "" iff the null score - the score of best non-null > threshold
+            score_diff = score_null - best_non_null_entry.start_logit - (
+                best_non_null_entry.end_logit)
+            scores_diff_json[example.qas_id] = score_diff
+            if score_diff > null_score_diff_threshold:
+                all_predictions[example.qas_id] = ""
+            else:
+                all_predictions[example.qas_id] = best_non_null_entry.text
+        all_nbest_json[example.qas_id] = nbest_json
+    with open(output_prediction_file, "w") as writer:
+        writer.write(json.dumps(all_predictions, indent=4) + "\n")
+    with open(output_nbest_file, "w") as writer:
+        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+    if with_negative:
+        with open(output_null_log_odds_file, "w") as writer:
+            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+def _get_final_text(pred_text, orig_text, do_lower_case, verbose):
+    """Project the tokenized prediction back to the original text."""
+    # When we created the data, we kept track of the alignment between original
+    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
+    # now `orig_text` contains the span of our original text corresponding to the
+    # span that we predicted.
+    #
+    # However, `orig_text` may contain extra characters that we don't want in
+    # our prediction.
+    #
+    # For example, let's say:
+    #   pred_text = steve smith
+    #   orig_text = Steve Smith's
+    #
+    # We don't want to return `orig_text` because it contains the extra "'s".
+    #
+    # We don't want to return `pred_text` because it's already been normalized
+    # (the MRQA eval script also does punctuation stripping/lower casing but
+    # our tokenizer does additional normalization like stripping accent
+    # characters).
+    #
+    # What we really want to return is "Steve Smith".
+    #
+    # Therefore, we have to apply a semi-complicated alignment heruistic between
+    # `pred_text` and `orig_text` to get a character-to-charcter alignment. This
+    # can fail in certain cases in which case we just return `orig_text`.
+    def _strip_spaces(text):
+        ns_chars = []
+        ns_to_s_map = collections.OrderedDict()
+        for (i, c) in enumerate(text):
+            if c == " ":
+                continue
+            ns_to_s_map[len(ns_chars)] = i
+            ns_chars.append(c)
+        ns_text = "".join(ns_chars)
+        return (ns_text, ns_to_s_map)
+    # We first tokenize `orig_text`, strip whitespace from the result
+    # and `pred_text`, and check if they are the same length. If they are
+    # NOT the same length, the heuristic has failed. If they are the same
+    # length, we assume the characters are one-to-one aligned.
+    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
+    tok_text = " ".join(tokenizer.tokenize(orig_text))
+    start_position = tok_text.find(pred_text)
+    if start_position == -1:
+        if verbose:
+            print("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
+        return orig_text
+    end_position = start_position + len(pred_text) - 1
+    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
+    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
+    if len(orig_ns_text) != len(tok_ns_text):
+        if verbose:
+            print("Length not equal after stripping spaces: '%s' vs '%s'",
+                  orig_ns_text, tok_ns_text)
+        return orig_text
+    # We then project the characters in `pred_text` back to `orig_text` using
+    # the character-to-character alignment.
+    tok_s_to_ns_map = {}
+    for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
+        tok_s_to_ns_map[tok_index] = i
+    orig_start_position = None
+    if start_position in tok_s_to_ns_map:
+        ns_start_position = tok_s_to_ns_map[start_position]
+        if ns_start_position in orig_ns_to_s_map:
+            orig_start_position = orig_ns_to_s_map[ns_start_position]
+    if orig_start_position is None:
+        if verbose:
+            print("Couldn't map start position")
+        return orig_text
+    orig_end_position = None
+    if end_position in tok_s_to_ns_map:
+        ns_end_position = tok_s_to_ns_map[end_position]
+        if ns_end_position in orig_ns_to_s_map:
+            orig_end_position = orig_ns_to_s_map[ns_end_position]
+    if orig_end_position is None:
+        if verbose:
+            print("Couldn't map end position")
+        return orig_text
+    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
+    return output_text
+def _get_best_indexes(logits, n_best_size):
+    """Get the n-best logits from a list."""
+    index_and_score = sorted(
+        enumerate(logits), key=lambda x: x[1], reverse=True)
+    best_indexes = []
+    for i in range(len(index_and_score)):
+        if i >= n_best_size:
+            break
+        best_indexes.append(index_and_score[i][0])
+    return best_indexes
+def _compute_softmax(scores):
+    """Compute softmax probability over raw logits."""
+    if not scores:
+        return []
+    max_score = None
+    for score in scores:
+        if max_score is None or score > max_score:
+            max_score = score
+    exp_scores = []
+    total_sum = 0.0
+    for score in scores:
+        x = math.exp(score - max_score)
+        exp_scores.append(x)
+        total_sum += x
+    probs = []
+    for score in exp_scores:
+        probs.append(score / total_sum)
+    return probs
--- a/paddlepalm/tokenizer/__init__.py
+++ b/paddlepalm/tokenizer/__init__.py
--- a/utils/tokenization.py
+++ b/utils/tokenization.py
+# -*- coding: UTF-8 -*-
 #   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,7 +21,7 @@ from __future__ import print_function
 import collections
 import unicodedata
 import six
-import io
 def convert_to_unicode(text):
    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
@@ -68,15 +69,15 @@ def printable_text(text):
 def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
-    with io.open(vocab_file, encoding="utf8") as fin:
+    fin = open(vocab_file)
-        for num, line in enumerate(fin):
+    for num, line in enumerate(fin):
-            items = convert_to_unicode(line.strip()).split("\t")
+        items = convert_to_unicode(line.strip()).split("\t")
-            if len(items) > 2:
+        if len(items) > 2:
-                break
+            break
-            token = items[0]
+        token = items[0]
-            index = items[1] if len(items) == 2 else num
+        index = items[1] if len(items) == 2 else num
-            token = token.strip()
+        token = token.strip()
-            vocab[token] = int(index)
+        vocab[token] = int(index)
    return vocab

--- a/paddlepalm/tokenizer/ernie_tokenizer.py
+++ b/paddlepalm/tokenizer/ernie_tokenizer.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+from __future__ import absolute_import
+from io import open
+import collections
+import unicodedata
+import six
+def convert_to_unicode(text):
+    """Return `text` as a unicode string, decoding byte strings as UTF-8.
+
+    Bytes that cannot be decoded are dropped ("ignore" error handler).
+
+    Raises:
+        ValueError: if `text` is neither a text nor a byte string, or if the
+            interpreter is reported as neither Python 2 nor Python 3.
+    """
+    if not (six.PY3 or six.PY2):
+        raise ValueError("Not running on Python2 or Python 3?")
+    # Pick the (already-unicode, needs-decoding) type pair for this interpreter.
+    if six.PY3:
+        text_type, byte_type = str, bytes
+    else:
+        text_type, byte_type = unicode, str
+    if isinstance(text, text_type):
+        return text
+    if isinstance(text, byte_type):
+        return text.decode("utf-8", "ignore")
+    raise ValueError("Unsupported string type: %s" % (type(text)))
+def printable_text(text):
+    """Return `text` as the native `str` type of the running interpreter.
+
+    On Python 3 that is a unicode string (bytes are decoded as UTF-8, ignoring
+    undecodable bytes); on Python 2 it is a byte string (unicode is encoded as
+    UTF-8).
+
+    Raises:
+        ValueError: if `text` is neither a text nor a byte string, or if the
+            interpreter is reported as neither Python 2 nor Python 3.
+    """
+    if six.PY3:
+        if isinstance(text, str):
+            return text
+        if isinstance(text, bytes):
+            return text.decode("utf-8", "ignore")
+    elif six.PY2:
+        if isinstance(text, str):
+            return text
+        if isinstance(text, unicode):
+            return text.encode("utf-8")
+    else:
+        raise ValueError("Not running on Python2 or Python 3?")
+    # Reached only when no supported string type matched above.
+    raise ValueError("Unsupported string type: %s" % (type(text)))
+def load_vocab(vocab_file):
+    """Read a vocabulary file into an ordered token -> index mapping.
+
+    Each line holds either `token` or `token<TAB>index`. Lines without an
+    explicit index use their zero-based line number. Reading stops at the
+    first line that has more than two tab-separated fields.
+    """
+    vocab = collections.OrderedDict()
+    with open(vocab_file, encoding='utf8') as fin:
+        for line_no, raw_line in enumerate(fin):
+            fields = convert_to_unicode(raw_line.strip()).split("\t")
+            n_fields = len(fields)
+            if n_fields > 2:
+                break
+            index = fields[1] if n_fields == 2 else line_no
+            vocab[fields[0].strip()] = int(index)
+    return vocab
+def convert_by_vocab(vocab, items):
+    """Look up every element of `items` in `vocab` and return the results in order.
+
+    Works in either direction (tokens -> ids or ids -> tokens) depending on
+    the mapping passed in. A missing key raises KeyError.
+    """
+    return [vocab[item] for item in items]
+def convert_tokens_to_ids(vocab, tokens):
+    """Map each token in `tokens` to its id using `vocab` (token -> id)."""
+    ids = convert_by_vocab(vocab, tokens)
+    return ids
+def convert_ids_to_tokens(inv_vocab, ids):
+    """Map each id in `ids` back to its token using `inv_vocab` (id -> token)."""
+    tokens = convert_by_vocab(inv_vocab, ids)
+    return tokens
+def whitespace_tokenize(text):
+    """Strip `text` and split it on runs of whitespace.
+
+    Returns an empty list when nothing but whitespace remains.
+    """
+    stripped = text.strip()
+    return stripped.split() if stripped else []
+class FullTokenizer(object):
+    """End-to-end tokenizer: basic tokenization followed by WordPiece splitting."""
+    def __init__(self, vocab_file, do_lower_case=True):
+        """Load the vocabulary from `vocab_file` and build both sub-tokenizers."""
+        self.vocab = load_vocab(vocab_file)
+        # Reverse mapping (id -> token) used by convert_ids_to_tokens.
+        self.inv_vocab = dict((idx, tok) for tok, idx in self.vocab.items())
+        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+    def tokenize(self, text):
+        """Return the WordPiece tokens of `text` as a flat list."""
+        return [
+            piece
+            for word in self.basic_tokenizer.tokenize(text)
+            for piece in self.wordpiece_tokenizer.tokenize(word)
+        ]
+    def convert_tokens_to_ids(self, tokens):
+        """Map tokens to ids with this tokenizer's vocabulary."""
+        return convert_by_vocab(self.vocab, tokens)
+    def convert_ids_to_tokens(self, ids):
+        """Map ids back to tokens with the inverted vocabulary."""
+        return convert_by_vocab(self.inv_vocab, ids)
+class CharTokenizer(object):
+    """Tokenizer that lower-cases text, splits on single spaces, then applies WordPiece."""
+    def __init__(self, vocab_file, do_lower_case=True):
+        """Load the vocabulary from `vocab_file` and build the WordPiece tokenizer.
+
+        NOTE(review): `do_lower_case` is accepted but never consulted;
+        `tokenize` always lower-cases its input.
+        """
+        self.vocab = load_vocab(vocab_file)
+        # Reverse mapping (id -> token) used by convert_ids_to_tokens.
+        self.inv_vocab = dict((idx, tok) for tok, idx in self.vocab.items())
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+    def tokenize(self, text):
+        """Return the WordPiece tokens of the lower-cased, space-split `text`."""
+        return [
+            piece
+            for word in text.lower().split(" ")
+            for piece in self.wordpiece_tokenizer.tokenize(word)
+        ]
+    def convert_tokens_to_ids(self, tokens):
+        """Map tokens to ids with this tokenizer's vocabulary."""
+        return convert_by_vocab(self.vocab, tokens)
+    def convert_ids_to_tokens(self, ids):
+        """Map ids back to tokens with the inverted vocabulary."""
+        return convert_by_vocab(self.inv_vocab, ids)
+class BasicTokenizer(object):
+    """Basic tokenizer: text cleanup, CJK spacing, optional lower-casing, punctuation splitting."""
+    # Inclusive codepoint ranges treated as CJK characters. See
+    #     https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+    #
+    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+    # despite its name. The modern Korean Hangul alphabet is a different block,
+    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+    # space-separated words, so they are not treated specially and are handled
+    # like all of the other languages.
+    _CJK_RANGES = (
+        (0x4E00, 0x9FFF),
+        (0x3400, 0x4DBF),
+        (0x20000, 0x2A6DF),
+        (0x2A700, 0x2B73F),
+        (0x2B740, 0x2B81F),
+        (0x2B820, 0x2CEAF),
+        (0xF900, 0xFAFF),
+        (0x2F800, 0x2FA1F),
+    )
+    def __init__(self, do_lower_case=True):
+        """Constructs a BasicTokenizer.
+        Args:
+            do_lower_case: Whether to lower case the input (accents are
+                stripped as well when this is set).
+        """
+        self.do_lower_case = do_lower_case
+    def tokenize(self, text):
+        """Tokenizes a piece of text and returns the list of tokens."""
+        text = self._clean_text(convert_to_unicode(text))
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        text = self._tokenize_chinese_chars(text)
+        pieces = []
+        for word in whitespace_tokenize(text):
+            if self.do_lower_case:
+                word = self._run_strip_accents(word.lower())
+            pieces.extend(self._run_split_on_punc(word))
+        return whitespace_tokenize(" ".join(pieces))
+    def _run_strip_accents(self, text):
+        """Strips accents by NFD-decomposing `text` and dropping "Mn" characters."""
+        decomposed = unicodedata.normalize("NFD", text)
+        return "".join(
+            ch for ch in decomposed if unicodedata.category(ch) != "Mn")
+    def _run_split_on_punc(self, text):
+        """Splits `text` so that every punctuation character is its own piece."""
+        groups = []
+        # True while the last group is a word that may still receive characters.
+        word_open = False
+        for ch in text:
+            if _is_punctuation(ch):
+                groups.append([ch])
+                word_open = False
+            else:
+                if not word_open:
+                    groups.append([])
+                    word_open = True
+                groups[-1].append(ch)
+        return ["".join(group) for group in groups]
+    def _tokenize_chinese_chars(self, text):
+        """Adds a space on both sides of every CJK character."""
+        return "".join(
+            " " + ch + " " if self._is_chinese_char(ord(ch)) else ch
+            for ch in text)
+    def _is_chinese_char(self, cp):
+        """Checks whether `cp` is the codepoint of a CJK character."""
+        return any(low <= cp <= high for low, high in self._CJK_RANGES)
+    def _clean_text(self, text):
+        """Drops NUL, U+FFFD and control characters; maps whitespace to a plain space."""
+        kept = []
+        for ch in text:
+            code = ord(ch)
+            if code == 0 or code == 0xfffd or _is_control(ch):
+                continue
+            kept.append(" " if _is_whitespace(ch) else ch)
+        return "".join(kept)
+class WordpieceTokenizer(object):
+    """Runs WordPiece tokenization."""
+    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+        """Store the vocabulary, the unknown-token marker and the word length limit."""
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+    def tokenize(self, text):
+        """Tokenizes a piece of text into its word pieces.
+        This uses a greedy longest-match-first algorithm to perform tokenization
+        using the given vocabulary.
+        For example:
+            input = "unaffable"
+            output = ["un", "##aff", "##able"]
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through `BasicTokenizer`.
+        Returns:
+            A list of wordpiece tokens. A word that is longer than
+            `max_input_chars_per_word`, or that cannot be fully covered by
+            vocabulary entries, contributes a single `unk_token`.
+        """
+        text = convert_to_unicode(text)
+        pieces = []
+        for word in whitespace_tokenize(text):
+            word_pieces = self._greedy_split(word)
+            if word_pieces is None:
+                pieces.append(self.unk_token)
+            else:
+                pieces.extend(word_pieces)
+        return pieces
+    def _greedy_split(self, word):
+        """Split one word into vocabulary pieces, or return None if that is impossible."""
+        n_chars = len(word)
+        if n_chars > self.max_input_chars_per_word:
+            return None
+        found = []
+        begin = 0
+        while begin < n_chars:
+            # Every piece after the first carries the "##" continuation marker.
+            marker = "##" if begin > 0 else ""
+            # Try the longest candidate first, shrinking from the right.
+            for stop in range(n_chars, begin, -1):
+                candidate = marker + word[begin:stop]
+                if candidate in self.vocab:
+                    found.append(candidate)
+                    begin = stop
+                    break
+            else:
+                # No prefix of the remaining characters is in the vocabulary.
+                return None
+        return found
+def _is_whitespace(char):
+    """Checks whether `char` is a whitespace character."""
+    # \t, \n, and \r are technically control characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char in (" ", "\t", "\n", "\r"):
+        return True
+    # Otherwise only the Unicode "space separator" category counts.
+    return unicodedata.category(char) == "Zs"
+def _is_control(char):
+    """Checks whether `char` is a control character."""
+    # These are technically control characters but we count them as whitespace
+    # characters.
+    if char in ("\t", "\n", "\r"):
+        return False
+    # Every Unicode category starting with "C" counts as control here.
+    return unicodedata.category(char).startswith("C")
+def _is_punctuation(char):
+    """Checks whether `char` is a punctuation character."""
+    code = ord(char)
+    # We treat all non-letter/number ASCII as punctuation.
+    # Characters such as "^", "$", and "`" are not in the Unicode
+    # Punctuation class but we treat them as punctuation anyways, for
+    # consistency.
+    if 33 <= code <= 47 or 58 <= code <= 64 or 91 <= code <= 96 or 123 <= code <= 126:
+        return True
+    return unicodedata.category(char).startswith("P")
+def tokenize_chinese_chars(text):
+    """Split `text` into a list of chunks around CJK and whitespace characters.
+
+    Every CJK character and every separator (space, tab, CR, LF, U+202F)
+    becomes its own list element; each maximal run of other characters is
+    kept together as one element. Separators are preserved in the output.
+    """
+    # Inclusive codepoint ranges treated as CJK characters. See
+    #     https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+    #
+    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+    # despite its name. The modern Korean Hangul alphabet is a different block,
+    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+    # space-separated words, so they are not treated specially and are handled
+    # like all of the other languages.
+    cjk_ranges = (
+        (0x4E00, 0x9FFF),
+        (0x3400, 0x4DBF),
+        (0x20000, 0x2A6DF),
+        (0x2A700, 0x2B73F),
+        (0x2B740, 0x2B81F),
+        (0x2B820, 0x2CEAF),
+        (0xF900, 0xFAFF),
+        (0x2F800, 0x2FA1F),
+    )
+    separators = (" ", "\t", "\r", "\n")
+    chunks = []
+    pending = []
+    for ch in text:
+        code = ord(ch)
+        stands_alone = (
+            any(low <= code <= high for low, high in cjk_ranges)
+            or ch in separators
+            or code == 0x202F)
+        if not stands_alone:
+            pending.append(ch)
+            continue
+        # Flush the run collected so far before emitting the lone character.
+        if pending:
+            chunks.append("".join(pending))
+            pending = []
+        chunks.append(ch)
+    if pending:
+        chunks.append("".join(pending))
+    return chunks
--- a/paddlepalm/utils/__init__.py
+++ b/paddlepalm/utils/__init__.py
--- a/utils/configure.py
+++ b/utils/configure.py
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,7 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# -*- coding: utf-8 -*-
 from __future__ import absolute_import
 from __future__ import division
@@ -19,7 +19,6 @@ from __future__ import print_function
 import os
 import sys
-import io
 import argparse
 import json
 import yaml
@@ -40,7 +39,7 @@ class JsonConfig(object):
    def _parse(self, config_path):
        try:
-            with io.open(config_path, encoding="utf8") as json_file:
+            with open(config_path) as json_file:
                config_dict = json.load(json_file)
                assert isinstance(config_dict, dict), "Object in {} is NOT a dict.".format(config_path)
        except:
@@ -165,17 +164,15 @@ class PDConfig(object):
    Can jointly work with command-line-arugment, json files and yaml files.
    """
-    def __init__(self, json_file="", yaml_file=[], fuse_args=True):
+    def __init__(self, json_file=None, yaml_file=None, fuse_args=True):
        """
            Init funciton for PDConfig.
            json_file: the path to the json configure file.
            yaml_file: the path to the yaml configure file.
            fuse_args: if fuse the json/yaml configs with argparse.
        """
-        assert isinstance(json_file, str)
-        assert isinstance(yaml_file, list)
-        if json_file != "" and yaml_file != []:
+        if json_file is not None and yaml_file is not None:
            raise Warning(
                "json_file and yaml_file can not co-exist for now. please only use one configure file type."
            )
@@ -188,28 +185,18 @@ class PDConfig(object):
        parser = argparse.ArgumentParser()
-        self.default_g = ArgumentGroup(parser, "default", "default options.")
        self.yaml_g = ArgumentGroup(parser, "yaml", "options from yaml.")
        self.json_g = ArgumentGroup(parser, "json", "options from json.")
        self.com_g = ArgumentGroup(parser, "custom", "customized options.")
-        """
-        self.default_g.add_arg("epoch", int, 2,
-                               "Number of epoches for training.")
-        self.default_g.add_arg("do_train", bool, False,
-                               "Whether to perform training.")
-        self.default_g.add_arg("do_predict", bool, False,
-                               "Whether to perform predicting.")
-        self.default_g.add_arg("do_eval", bool, False,
-                               "Whether to perform evaluating.")
-        """
        self.parser = parser
-        if json_file != "":
+        if json_file is not None:
+            assert isinstance(json_file, str)
            self.load_json(json_file, fuse_args=fuse_args)
-        if yaml_file:
+        if yaml_file is not None:
+            assert isinstance(yaml_file, str) or isinstance(yaml_file, list)
            self.load_yaml(yaml_file, fuse_args=fuse_args)
    def load_json(self, file_path, fuse_args=True):
@@ -218,7 +205,7 @@ class PDConfig(object):
            raise Warning("the json file %s does not exist." % file_path)
            return
-        with io.open(file_path, "r", encoding="utf8") as fin:
+        with open(file_path, "r") as fin:
            self.json_config = json.loads(fin.read())
            fin.close()
@@ -238,15 +225,15 @@ class PDConfig(object):
    def load_yaml(self, file_path_list, fuse_args=True):
+        if isinstance(file_path_list, str):
+            file_path_list = [file_path_list]
        for file_path in file_path_list: 
            if not os.path.exists(file_path):
                raise Warning("the yaml file %s does not exist." % file_path)
                return
-            with io.open(file_path, "r", encoding="utf8") as fin: 
+            with open(file_path, "r") as fin: 
                self.yaml_config = yaml.load(fin, Loader=yaml.SafeLoader)
-                fin.close()
            if fuse_args:
                for name in self.yaml_config:
                    if not isinstance(self.yaml_config[name], int) \
@@ -265,6 +252,9 @@ class PDConfig(object):
        self.args = self.parser.parse_args()
        self.arg_config = vars(self.args)
+    def asdict(self):
+        return self.arg_config
    def __add__(self, new_arg):
        assert isinstance(new_arg, list) or isinstance(new_arg, tuple)
        assert len(new_arg) >= 3
@@ -290,7 +280,7 @@ class PDConfig(object):
        if name in self.yaml_config:
            return self.yaml_config[name]
-        raise AttributeError("The argument %s is not defined." % name)
+        raise Warning("The argument %s is not defined." % name)
    def Print(self):

--- a/paddlepalm/utils/print_helper.py
+++ b/paddlepalm/utils/print_helper.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Total width, in characters, of the header rule printed by print_dict.
+MAXLEN = 70
+def print_dict(dic, title=""):
+    """Print a header rule followed by one `name<TAB>value` line per entry of `dic`.
+
+    With a non-empty `title`, the title is placed in the header rule with
+    dashes on both sides up to MAXLEN characters; otherwise the rule is
+    MAXLEN dashes. Names are left-aligned in a 25-character column. A blank
+    line is printed at the end.
+    """
+    if title:
+        header = '-' * ((MAXLEN - len(title) - 2) // 2) + ' ' + title + ' '
+        # Fill whatever is left on the right so the rule reaches MAXLEN.
+        header += '-' * (MAXLEN - len(header))
+    else:
+        header = '-' * MAXLEN
+    print(header)
+    for name in dic:
+        print("{: <25}\t{}".format(str(name), str(dic[name])))
+    print("")
--- a/paddlepalm/utils/reader_helper.py
+++ b/paddlepalm/utils/reader_helper.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+import random
+import numpy as np
+import paddle
+from paddle import fluid
+from paddle.fluid import layers
+def _check_and_adapt_shape_dtype(rt_val, attr):
+    """Validate a yielded value against its `(shape, dtype)` attr and return it as an ndarray.
+
+    Values that are not already ndarrays are converted with `np.array`; in
+    that case float64 results are cast to float32. Dimensions declared as
+    None or negative in `shape` are not checked.
+
+    Raises:
+        AssertionError: on object dtype, dtype mismatch, rank mismatch, or a
+            mismatch on any checked dimension.
+    """
+    expected_shape, expected_dtype = attr
+    if not isinstance(rt_val, np.ndarray):
+        rt_val = np.array(rt_val)
+        assert rt_val.dtype != np.dtype('O'), "yielded data is not a valid tensor(number of elements on some dimension may differ)."
+        if rt_val.dtype == np.dtype('float64'):
+            rt_val = rt_val.astype('float32')
+    assert rt_val.dtype == np.dtype(expected_dtype), "yielded data type not consistent with attr settings."
+    assert len(expected_shape) == rt_val.ndim, "yielded data rank(ndim) not consistent with attr settings."
+    for actual, wanted in zip(rt_val.shape, expected_shape):
+        if not (wanted is None or wanted < 0):
+            assert actual == wanted, "yielded data shape is not consistent with attr settings.\nExpected:{}\nActual:{}".format(wanted, actual)
+    return rt_val
+def _zero_batch(attrs):
+    """Build one all-zero array per `(shape, dtype)` pair in `attrs`.
+
+    Any dimension that is None, zero or negative is replaced by 1 so that
+    every array has a concrete, positive shape.
+    """
+    concrete_attrs = [
+        ([dim if dim and dim > 0 else 1 for dim in shape], dtype)
+        for shape, dtype in attrs
+    ]
+    return [np.zeros(shape=dims, dtype=dtype) for dims, dtype in concrete_attrs]
+def create_net_inputs(input_attrs, use_async=False, iterator_fn=None, dev_count=1, n_prefetch=1, **legacy_kwargs):
+    """Declare one fluid data layer per input attribute and return them by name.
+
+    Args:
+        input_attrs: iterable of `(name, shape, dtype)` triples; each one is
+            passed to `layers.data(name, shape=shape, dtype=dtype)`.
+        use_async: when true, additionally build a non-iterable
+            `fluid.io.PyReader` over the declared inputs, attach `iterator_fn`
+            as its batch generator and start it. This parameter used to be
+            named `async`, which is a reserved keyword since Python 3.7 and
+            made the module fail to compile there. It keeps its position, and
+            the old keyword spelling is still accepted (see below).
+        iterator_fn: batch generator for the reader; required when
+            `use_async` is true.
+        dev_count: multiplied with `n_prefetch` to get the reader capacity.
+        n_prefetch: multiplied with `dev_count` to get the reader capacity.
+        **legacy_kwargs: only the key `async` is accepted, as an alias of
+            `use_async` for callers written against the old signature (on
+            Python 3.7+ it can be passed as `**{'async': True}`).
+
+    Returns:
+        dict mapping each input name to the object returned by `layers.data`.
+        The reader, when one is built, is not returned.
+
+    Raises:
+        TypeError: on any unexpected keyword argument.
+        AssertionError: if `use_async` is true and `iterator_fn` is None.
+    """
+    if 'async' in legacy_kwargs:
+        use_async = legacy_kwargs.pop('async')
+    if legacy_kwargs:
+        raise TypeError(
+            "create_net_inputs() got unexpected keyword argument(s): %s"
+            % ', '.join(sorted(legacy_kwargs)))
+    inputs = []
+    ret = {}
+    for name, shape, dtype in input_attrs:
+        p = layers.data(name, shape=shape, dtype=dtype)
+        ret[name] = p
+        inputs.append(p)
+    if use_async:
+        assert iterator_fn is not None, "iterator_fn is needed for building async input layer."
+        reader = fluid.io.PyReader(inputs, capacity=dev_count*n_prefetch, iterable=False)
+        reader.decorate_batch_generator(iterator_fn)
+        reader.start()
+    return ret
+def create_iterator_fn(iterator, iterator_prefix, shape_and_dtypes, outname_to_pos, verbose=0):
+    """Wrap a dict-yielding iterator into a generator function of positional batches.
+
+    Args:
+        iterator: iterator whose `next()` returns a dict of output name -> value.
+        iterator_prefix: task prefix; an output is also looked up under
+            `iterator_prefix + '/' + outname`.
+        shape_and_dtypes: list of `(shape, dtype)` attrs, one per batch slot;
+            used both to build the zero-filled batch and to validate values.
+        outname_to_pos: dict mapping (possibly prefixed) output names to
+            their slot index in the batch.
+        verbose: accepted for signature parity with create_joint_iterator_fn;
+            not used here.
+
+    Returns:
+        A zero-argument generator function. Each yielded batch is a fresh
+        list with one array per slot; slots not produced by `iterator` stay
+        zero-filled.
+    """
+    # The inner function must not be called `iterator`: that would rebind the
+    # argument of the same name and make `next(iterator)` receive the generator
+    # function itself.
+    def batch_generator():
+        while True:
+            try:
+                outputs = next(iterator) # dict type
+            except StopIteration:
+                # End the generator cleanly; letting StopIteration escape a
+                # generator body is turned into RuntimeError by PEP 479.
+                return
+            results = _zero_batch(shape_and_dtypes)
+            for outname, val in outputs.items():
+                task_outname = iterator_prefix + '/' + outname
+                if outname in outname_to_pos:
+                    idx = outname_to_pos[outname]
+                    val = _check_and_adapt_shape_dtype(val, shape_and_dtypes[idx])
+                    results[idx] = val
+                if task_outname in outname_to_pos:
+                    idx = outname_to_pos[task_outname]
+                    val = _check_and_adapt_shape_dtype(val, shape_and_dtypes[idx])
+                    results[idx] = val
+            yield results
+    return batch_generator
+def create_joint_iterator_fn(iterators, iterator_prefixes, joint_shape_and_dtypes, mrs, outname_to_pos, dev_count=1, keep_one_task=True, verbose=0):
+    """Build a generator function that samples a task per step and yields its batch.
+
+    Args:
+        iterators: one dict-yielding iterator per task.
+        iterator_prefixes: one name prefix per task; outputs are also looked
+            up under `prefix + '/' + outname`.
+        joint_shape_and_dtypes: essentially set according to the attrs of the
+            backbone (bb) and the paradigm (parad), with the -1 (variable)
+            dimensions filled in automatically from the reader's attrs;
+            checking it against the iterator output therefore provides a
+            runtime correctness check of each batch.
+        mrs: per-task mix ratios; normalised into sampling probabilities.
+        outname_to_pos: dict mapping output names to slot indexes.
+        dev_count: number of consecutive batches drawn from the sampled task;
+            forced to 1 when `keep_one_task` is false.
+        keep_one_task: see `dev_count`.
+        verbose: number of yielded batches for which debug output is printed.
+
+    Returns:
+        A zero-argument generator function. Every yield returns the same list
+        object, updated in place; slot 0 holds the sampled task id as a
+        (1, 1) int64 array.
+    """
+    task_ids = range(len(iterators))
+    weights = [mr / float(sum(mrs)) for mr in mrs]
+    if not keep_one_task:
+        dev_count = 1
+
+    def _scatter(batch, task_outputs, prefix, log):
+        # Write every known output of one task into its slot(s) of `batch`.
+        for outname, val in task_outputs.items():
+            if log:
+                print('reader generate: '+outname)
+            task_outname = prefix + '/' + outname
+            for key in (outname, task_outname):
+                if key in outname_to_pos:
+                    idx = outname_to_pos[key]
+                    if log:
+                        print(key + ' is insert in idx ' + str(idx))
+                    val = _check_and_adapt_shape_dtype(val, joint_shape_and_dtypes[idx])
+                    batch[idx] = val
+
+    # build fake batch
+    # Note this approach has a drawback: setting a task's mix ratio to 0 does
+    # not prevent data from being read from that task, so deleting that dataset
+    # causes a crash. Compared with the earlier zero-batch approach, however,
+    # it does not need to assume that only one dimension has size -1 and that
+    # a -1 in dimension 0 must be the batch size.
+    fake_batch = _zero_batch(joint_shape_and_dtypes)
+    # First batch of every task, kept so it is served before reading further.
+    outbuf = {}
+    for task_id in task_ids:
+        first_outputs = next(iterators[task_id]) # dict type
+        outbuf[task_id] = first_outputs
+        _scatter(fake_batch, first_outputs, iterator_prefixes[task_id], False)
+
+    def iterator():
+        remaining_logs = verbose
+        while True:
+            sampled = np.random.choice(task_ids, p=weights)
+            results = fake_batch
+            if remaining_logs > 0:
+                print('----- debug joint iterator -----')
+                print('sampled task id: '+str(sampled))
+            results[0] = np.array([[sampled]]).astype("int64")
+            for _ in range(dev_count):
+                if sampled in outbuf:
+                    outputs = outbuf.pop(sampled)
+                else:
+                    outputs = next(iterators[sampled]) # dict type
+                _scatter(results, outputs, iterator_prefixes[sampled], remaining_logs > 0)
+                if remaining_logs > 0:
+                    print('yielded batch len and shapes:')
+                    print(len(results))
+                    for tensor in results:
+                        print(np.shape(tensor))
+                    print('')
+                    remaining_logs -= 1
+                yield results
+    return iterator
+def merge_input_attrs(backbone_attr, task_attrs, insert_taskid=True):
+    """Merge backbone and task input attributes into one positional layout.
+
+    Args:
+        backbone_attr(dict): backbone input attributes, key=attr_name, val=[shape, dtype]
+        task_attrs(list[dict]|dict): task input attributes, key=attr_name, val=[shape, dtype], support single task and nested tasks
+        insert_taskid(bool): when true, slot 0 is reserved for `__task_id`
+            with attr `([1,1], 'int64')`.
+
+    Returns:
+        (names, ret, name_to_position): `names[i]` and `ret[i]` describe slot
+        `i` (backbone names first, sorted; then each task's names, sorted),
+        and `name_to_position` maps a name to its slot index. If a name
+        occurs more than once, it maps to its last slot.
+    """
+    if isinstance(task_attrs, dict):
+        task_attrs = [task_attrs]
+    if insert_taskid:
+        # pos=0 is for task_id, thus the backbone attrs start from 1
+        ret = [([1,1], 'int64')]
+        names = ['__task_id']
+        start = 1
+    else:
+        ret = []
+        names = []
+        start = 0
+    names += sorted(backbone_attr.keys())
+    ret.extend([backbone_attr[k] for k in names[start:]])
+    name_to_position = {}
+    for pos, k in enumerate(names):
+        name_to_position[k] = pos
+    for task_attr in task_attrs:
+        task_names = sorted(task_attr.keys())
+        # Offset by the number of slots already laid out, not by the number of
+        # distinct names: once a name repeats, len(name_to_position) falls
+        # behind len(ret) and every later position would point at the wrong
+        # slot.
+        offset = len(names)
+        names.extend(task_names)
+        ret.extend([task_attr[k] for k in task_names])
+        for pos, k in enumerate(task_names, start=offset):
+            name_to_position[k] = pos
+    return names, ret, name_to_position
--- a/utils/init.py
+++ b/utils/init.py
+# -*- coding: UTF-8 -*-
 #   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -11,7 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# -*- coding: utf-8 -*-
 from __future__ import print_function
@@ -23,92 +23,43 @@ import copy
 import numpy as np
 import paddle.fluid as fluid
+def init_checkpoint(exe, init_checkpoint_path, main_program, skip_list = []):
-def cast_fp32_to_fp16(exe, main_program):
-    print("Cast parameters to float16 data format.")
-    for param in main_program.global_block().all_parameters():
-        if not param.name.endswith(".master"):
-            param_t = fluid.global_scope().find_var(param.name).get_tensor()
-            data = np.array(param_t)
-            if param.name.find("layer_norm") == -1:
-                param_t.set(np.float16(data).view(np.uint16), exe.place)
-            master_param_var = fluid.global_scope().find_var(param.name +
-                                                             ".master")
-            if master_param_var is not None:
-                master_param_var.get_tensor().set(data, exe.place)
-def init_checkpoint(exe, init_checkpoint_path, main_program, use_fp16=False, skip_list = []):
    assert os.path.exists(
        init_checkpoint_path), "[%s] cann't be found." % init_checkpoint_path
-    assert os.path.isdir(init_checkpoint_path), '{} is not a dir.'.format(init_checkpoint_path)
-    path = init_checkpoint_path
-    if not os.path.split(init_checkpoint_path)[-1].startswith('step_') and 'params' != os.path.split(init_checkpoint_path)[-1]:
-        max_step = 0
-        for d in os.listdir(init_checkpoint_path):
-            if os.path.isdir(os.path.join(init_checkpoint_path, d)):
-                if d.startswith('step_'):
-                    step = int(d.lstrip('step_').rstrip('_final'))
-                    if step > max_step:
-                        path = os.path.join(init_checkpoint_path, d)
-                        max_step = step
    def existed_persitables(var):
        if not fluid.io.is_persistable(var):
            return False
        if var.name in skip_list:
            return False
-        return os.path.exists(os.path.join(path, var.name))
+        return os.path.exists(os.path.join(init_checkpoint_path, var.name))
-    print("loading checkpoint from {}...".format(path))
    fluid.io.load_vars(
        exe,
-        path,
+        init_checkpoint_path,
        main_program=main_program,
        predicate=existed_persitables)
+    print("Load model from {}".format(init_checkpoint_path))
-    if use_fp16:
-        cast_fp32_to_fp16(exe, main_program)
 def init_pretraining_params(exe,
                            pretraining_params_path,
-                            main_program,
+                            main_program):
-                            use_fp16=False):
    assert os.path.exists(pretraining_params_path
                          ), "[%s] cann't be found." % pretraining_params_path
-    assert os.path.isdir(pretraining_params_path), '{} is not a dir.'.format(pretraining_params_path)
-    if os.path.exists(os.path.join(pretraining_params_path, 'params')):
-        pretraining_params_path = os.path.join(pretraining_params_path, 'params')
-    if not os.path.split(pretraining_params_path)[-1] == 'params':
-        raise Warning('Dir "params" not found in {}.'.format(pretraining_params_path))
-        max_step = 0
-        path = pretraining_params_path
-        for d in os.listdir(pretraining_params_path):
-            if os.path.isdir(os.path.join(pretraining_params_path, d)):
-                if d.startswith('step_'):
-                    step = int(d.lstrip('step_').rstrip('_final'))
-                    if step > max_step:
-                        path = os.path.join(pretraining_params_path, d)
-                        max_step = step
-        pretraining_params_path = path
-    print("loading pretrained parameters from {}...".format(
-        pretraining_params_path))
    def existed_params(var):
        if not isinstance(var, fluid.framework.Parameter):
            return False
        return os.path.exists(os.path.join(pretraining_params_path, var.name))
+    print("Load pretraining parameters from {}...\n".format(
+        pretraining_params_path))
    fluid.io.load_vars(
        exe,
        pretraining_params_path,
        main_program=main_program,
        predicate=existed_params)
-    if use_fp16:
-        cast_fp32_to_fp16(exe, main_program)
--- a/paddlepalm/utils/textprocess_helper.py
+++ b/paddlepalm/utils/textprocess_helper.py
+# -*- coding: UTF-8 -*-
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+def is_whitespace(c):
+    """Return True if `c` is a space, tab, CR, LF or U+202F, else False."""
+    # The tuple membership is checked first so that ord() is only reached
+    # for values that are not one of the four ASCII whitespace characters.
+    return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F
--- a/paradigm/README.md
+++ b/paradigm/README.md
-该目录用来存放任务范式，用户可通过实现paradigm的接口完成自定义。
--- a/paradigm/mask_language_model.py
+++ b/paradigm/mask_language_model.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# -*- coding: utf-8 -*-
-import paddle.fluid as fluid
-from backbone.utils.transformer import pre_process_layer
-from utils.configure import JsonConfig
-def compute_loss(output_tensors, args=None):
-    """Compute loss for mlm model"""
-    fc_out = output_tensors['mlm_out']
-    mask_label = output_tensors['mask_label']
-    mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
-        logits=fc_out, label=mask_label)
-    mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
-    return mean_mask_lm_loss
-def create_model(reader_input, base_model=None, is_training=True, args=None):
-    """
-        given the base model, reader_input
-        return the output tensors
-    """
-    src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos = reader_input
-    config = JsonConfig(args.pretrain_config_path)
-    _emb_size = config['hidden_size']
-    _voc_size = config['vocab_size']
-    _hidden_act = config['hidden_act']
-    _word_emb_name = "word_embedding"
-    _dtype = "float32"
-    _param_initializer = fluid.initializer.TruncatedNormal(
-        scale=config['initializer_range'])
-    mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
-    enc_out = base_model.final_word_representation
-    # extract the first token feature in each sentence
-    reshaped_emb_out = fluid.layers.reshape(
-        x=enc_out, shape=[-1, _emb_size])
-    # extract masked tokens' feature
-    mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
-    num_seqs = fluid.layers.fill_constant(shape=[1], value=512, dtype='int64')
-    # transform: fc
-    mask_trans_feat = fluid.layers.fc(
-        input=mask_feat,
-        size=_emb_size,
-        act=_hidden_act,
-        param_attr=fluid.ParamAttr(
-            name='mask_lm_trans_fc.w_0',
-            initializer=_param_initializer),
-        bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0'))
-    # transform: layer norm
-    mask_trans_feat = pre_process_layer(
-        mask_trans_feat, 'n', name='mask_lm_trans')
-    mask_lm_out_bias_attr = fluid.ParamAttr(
-        name="mask_lm_out_fc.b_0",
-        initializer=fluid.initializer.Constant(value=0.0))
-    fc_out = fluid.layers.matmul(
-        x=mask_trans_feat,
-        y=fluid.default_main_program().global_block().var(
-            _word_emb_name),
-        transpose_y=True)
-    fc_out += fluid.layers.create_parameter(
-        shape=[_voc_size],
-        dtype=_dtype,
-        attr=mask_lm_out_bias_attr,
-        is_bias=True)
-    output_tensors = {}
-    output_tensors['num_seqs'] = num_seqs
-    output_tensors['mlm_out'] = fc_out
-    output_tensors['mask_label'] = mask_label
-    return output_tensors
--- a/paradigm/reading_comprehension.py
+++ b/paradigm/reading_comprehension.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# -*- coding: utf-8 -*-
-import paddle.fluid as fluid
-import collections
-import os
-from paddle.fluid import layers
-def compute_loss(output_tensors, args=None):
-    """Compute loss for mrc model"""
-    def _compute_single_loss(logits, positions):
-        """Compute start/end loss for mrc model"""
-        loss = fluid.layers.softmax_with_cross_entropy(
-            logits=logits, label=positions)
-        loss = fluid.layers.mean(x=loss)
-        return loss
-    start_logits = output_tensors['start_logits']
-    end_logits = output_tensors['end_logits']
-    start_positions = output_tensors['start_positions']
-    end_positions = output_tensors['end_positions']
-    start_loss = _compute_single_loss(start_logits, start_positions)
-    end_loss = _compute_single_loss(end_logits, end_positions)
-    total_loss = (start_loss + end_loss) / 2.0
-    return total_loss
-def create_model(reader_input, base_model=None, is_training=True, args=None):
-    """
-        given the base model, reader_input
-        return the output tensors
-    """
-    if is_training:
-        src_ids, pos_ids, sent_ids, input_mask, start_positions, end_positions = reader_input
-    else:
-        src_ids, pos_ids, sent_ids, input_mask, unique_id = reader_input
-    enc_out = base_model.final_word_representation
-    logits = fluid.layers.fc(
-        input=enc_out,
-        size=2,
-        num_flatten_dims=2,
-        param_attr=fluid.ParamAttr(
-            name="cls_squad_out_w",
-            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
-        bias_attr=fluid.ParamAttr(
-            name="cls_squad_out_b", initializer=fluid.initializer.Constant(0.)))
-    logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
-    start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)
-    batch_ones = fluid.layers.fill_constant_batch_size_like(
-        input=start_logits, dtype='int64', shape=[1], value=1)
-    num_seqs = fluid.layers.reduce_sum(input=batch_ones)
-    output_tensors = {}
-    output_tensors['start_logits'] = start_logits
-    output_tensors['end_logits'] = end_logits
-    output_tensors['num_seqs'] = num_seqs
-    if is_training:
-        output_tensors['start_positions'] = start_positions
-        output_tensors['end_positions'] = end_positions
-    else:
-        output_tensors['unique_id'] = unique_id
-    return output_tensors
-RawResult = collections.namedtuple("RawResult",
-                                   ["unique_id", "start_logits", "end_logits"])
-def postprocess(fetch_results):
-    np_unique_ids= fetch_results['unique_id']
-    np_start_logits= fetch_results['start_logits']
-    np_end_logits= fetch_results['end_logits']
-    ret = []
-    for idx in range(np_unique_ids.shape[0]):                                                                                                                                          
-        if np_unique_ids[idx] < 0:
-            continue
-        unique_id = int(np_unique_ids[idx])
-        start_logits = [float(x) for x in np_start_logits[idx].flat]
-        end_logits = [float(x) for x in np_end_logits[idx].flat]
-        ret.append(
-            RawResult(
-                unique_id=unique_id,
-                start_logits=start_logits,
-                end_logits=end_logits))
-    return ret
-def global_postprocess(pred_buf, processor, mtl_args, task_args):
-    if not os.path.exists(mtl_args.checkpoint_path):
-        os.makedirs(mtl_args.checkpoints)
-    output_prediction_file = os.path.join(mtl_args.checkpoint_path, "predictions.json")
-    output_nbest_file = os.path.join(mtl_args.checkpoint_path, "nbest_predictions.json")
-    output_null_log_odds_file = os.path.join(mtl_args.checkpoint_path, "null_odds.json")
-    processor.write_predictions(pred_buf, task_args.n_best_size, task_args.max_answer_length,
-                                task_args.do_lower_case, output_prediction_file,
-                                output_nbest_file, output_null_log_odds_file,
-                                task_args.with_negative,
-                                task_args.null_score_diff_threshold, task_args.verbose)
--- a/pretrain_model/README.md
+++ b/pretrain_model/README.md
-该目录用于存放预训练及其配置文件，用户可通过运行`download_pretrain.sh`下载内置的预训练模型。
--- a/reader/README.md
+++ b/reader/README.md
-该目录存放数据集载入与处理模块reader，用户可通过实现相关接口完成自定义
--- a/reader/answer_matching_reader.py
+++ b/reader/answer_matching_reader.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import types
-import csv
-import numpy as np
-from utils import tokenization
-import io
-from utils.batching import prepare_batch_data
-def get_input_shape(args):
-    """
-    define answer matching model input shape
-    """
-    train_input_shape = {"backbone": [([-1, args.max_seq_len, 1], 'int64'),
-                                      ([-1, args.max_seq_len, 1], 'int64'),
-                                      ([-1, args.max_seq_len, 1], 'int64'),
-                                      ([-1, args.max_seq_len, 1], 'float32')],
-                         "task": [([-1, 1], 'int64')]
-                         }
-    test_input_shape = {"backbone": [([-1, args.max_seq_len, 1], 'int64'),
-                                     ([-1, args.max_seq_len, 1], 'int64'),
-                                     ([-1, args.max_seq_len, 1], 'int64'),
-                                     ([-1, args.max_seq_len, 1], 'float32')],
-                         "task": [([-1, 1], 'int64')]
-                         }
-    return train_input_shape, test_input_shape
-class BaseProcessor(object):
-    """Base class for data converters for sequence classification data sets."""
-    def __init__(self, args):
-        self.train_file = args.train_file
-        self.max_seq_len = args.max_seq_len
-        self.batch_size = args.batch_size
-        self.epoch = args.epoch
-        self.tokenizer = tokenization.FullTokenizer(
-            vocab_file=args.vocab_path, do_lower_case=args.do_lower_case)
-        self.vocab = self.tokenizer.vocab
-        self.in_tokens = args.in_tokens
-        self.current_train_example = -1
-        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
-        self.current_train_epoch = -1
-    def get_train_examples(self, file_path):
-        """Gets a collection of `InputExample`s for the train set."""
-        raise NotImplementedError()
-    def get_dev_examples(self, file_path):
-        """Gets a collection of `InputExample`s for the dev set."""
-        raise NotImplementedError()
-    def get_test_examples(self, file_path):
-        """Gets a collection of `InputExample`s for prediction."""
-        raise NotImplementedError()
-    def get_labels(self):
-        """Gets the list of labels for this data set."""
-        raise NotImplementedError()
-    def convert_example(self, index, example, labels, max_seq_len, tokenizer):
-        """Converts a single `InputExample` into a single `InputFeatures`."""
-        feature = convert_single_example(index, example, labels, max_seq_len,
-                                         tokenizer)
-        return feature
-    def generate_instance(self, feature):
-        """
-        generate instance with given feature
-        Args:
-            feature: InputFeatures(object). A single set of features of data.
-        """
-        input_pos = list(range(len(feature.input_ids)))
-        return [
-            feature.input_ids, feature.segment_ids, input_pos, feature.label_id
-        ]
-    def generate_batch_data(self,
-                            batch_data,
-                            total_token_num,
-                            voc_size=-1,
-                            mask_id=-1,
-                            return_input_mask=True,
-                            return_max_len=False,
-                            return_num_token=False):
-        return prepare_batch_data(
-            batch_data,
-            total_token_num,
-            voc_size=-1,
-            pad_id=self.vocab["[PAD]"],
-            cls_id=self.vocab["[CLS]"],
-            sep_id=self.vocab["[SEP]"],
-            mask_id=-1,
-            return_input_mask=True,
-            return_max_len=False,
-            return_num_token=False)
-    @classmethod
-    def _read_tsv(cls, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with io.open(input_file, "r", encoding="utf8") as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
-            lines = []
-            for line in reader:
-                lines.append(line)
-            return lines
-    def get_num_examples(self, phase):
-        """Get number of examples for train, dev or test."""
-        if phase not in ['train', 'dev', 'test']:
-            raise ValueError(
-                "Unknown phase, which should be in ['train', 'dev', 'test'].")
-        return self.num_examples[phase]
-    def get_train_progress(self):
-        """Gets progress for training phase."""
-        return self.current_train_example, self.current_train_epoch
-    def data_generator(self,
-                       phase='train',
-                       dev_count=1,
-                       shuffle=True
-                       ):
-        """
-        Generate data for train, dev or test.
-        Args:
-          batch_size: int. The batch size of generated data.
-          phase: string. The phase for which to generate data.
-          epoch: int. Total epoches to generate data.
-          shuffle: bool. Whether to shuffle examples.
-        """
-        if phase == 'train':
-            examples = self.get_train_examples(self.train_file)
-            self.num_examples['train'] = len(examples)
-        elif phase == 'dev':
-            examples = self.get_dev_examples(self.dev_file)
-            self.num_examples['dev'] = len(examples)
-        elif phase == 'test':
-            examples = self.get_test_examples(self.test_file)
-            self.num_examples['test'] = len(examples)
-        else:
-            raise ValueError(
-                "Unknown phase, which should be in ['train', 'dev', 'test'].")
-        def instance_reader():
-            for epoch_index in range(self.epoch):
-                if shuffle:
-                    np.random.shuffle(examples)
-                if phase == 'train':
-                    self.current_train_epoch = epoch_index
-                for (index, example) in enumerate(examples):
-                    if phase == 'train':
-                        self.current_train_example = index + 1
-                    feature = self.convert_example(
-                        index, example,
-                        self.get_labels(), self.max_seq_len, self.tokenizer)
-                    instance = self.generate_instance(feature)
-                    yield instance
-        def batch_reader(reader, batch_size, in_tokens):
-            batch, total_token_num, max_len = [], 0, 0
-            for instance in reader():
-                token_ids, sent_ids, pos_ids, label = instance[:4]
-                max_len = max(max_len, len(token_ids))
-                if in_tokens:
-                    to_append = (len(batch) + 1) * max_len <= batch_size
-                else:
-                    to_append = len(batch) < batch_size
-                if to_append:
-                    batch.append(instance)
-                    total_token_num += len(token_ids)
-                else:
-                    yield batch, total_token_num
-                    batch, total_token_num, max_len = [instance], len(
-                        token_ids), len(token_ids)
-            if len(batch) > 0:
-                yield batch, total_token_num
-        def wrapper():
-            all_dev_batches = []
-            for batch_data, total_token_num in batch_reader(
-                    instance_reader, self.batch_size, self.in_tokens):
-                batch_data = self.generate_batch_data(
-                    batch_data,
-                    total_token_num,
-                    voc_size=-1,
-                    mask_id=-1,
-                    return_input_mask=True,
-                    return_max_len=False,
-                    return_num_token=False)
-                if len(all_dev_batches) < dev_count:
-                    all_dev_batches.append(batch_data)
-                if len(all_dev_batches) == dev_count:
-                    for batch in all_dev_batches:
-                        yield batch
-                    all_dev_batches = []
-        return wrapper
-class DataProcessor(BaseProcessor):
-    """Processor for the MultiNLI data set (GLUE version)."""
-    def get_train_examples(self, file_path):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(file_path), "train")
-    def get_dev_examples(self, file_path):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(file_path), "dev")
-    def get_test_examples(self, file_path):
-        """See base class."""
-        return self._create_examples(
-            self._read_tsv(file_path), "test")
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            guid = "%s-%s" % (set_type,
-                              tokenization.convert_to_unicode("0000"))
-            text_a = tokenization.convert_to_unicode(line[1])
-            text_b = tokenization.convert_to_unicode(line[2])
-            if set_type == "test":
-                label = "0"
-            else:
-                label = tokenization.convert_to_unicode(line[0])
-            examples.append(
-                InputExample(
-                    guid=guid, text_a=text_a, text_b=text_b, label=label))
-        return examples
-class InputExample(object):
-    """A single training/test example for simple sequence classification."""
-    def __init__(self, guid, text_a, text_b=None, label=None):
-        """Constructs a InputExample.
-    Args:
-      guid: Unique id for the example.
-      text_a: string. The untokenized text of the first sequence. For single
-        sequence tasks, only this sequence must be specified.
-      text_b: (Optional) string. The untokenized text of the second sequence.
-        Only must be specified for sequence pair tasks.
-      label: (Optional) string. The label of the example. This should be
-        specified for train and dev examples, but not for test examples.
-    """
-        self.guid = guid
-        self.text_a = text_a
-        self.text_b = text_b
-        self.label = label
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length."""
-    # This is a simple heuristic which will always truncate the longer sequence
-    # one token at a time. This makes more sense than truncating an equal percent
-    # of tokens from each, since if one sequence is very short then each token
-    # that's truncated likely contains more information than a longer sequence.
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            tokens_b.pop()
-class InputFeatures(object):
-    """A single set of features of data."""
-    def __init__(self, input_ids, input_mask, segment_ids, label_id):
-        self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.segment_ids = segment_ids
-        self.label_id = label_id
-def convert_single_example_to_unicode(guid, single_example):
-    text_a = tokenization.convert_to_unicode(single_example[0])
-    text_b = tokenization.convert_to_unicode(single_example[1])
-    label = tokenization.convert_to_unicode(single_example[2])
-    return InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
-def convert_single_example(ex_index, example, label_list, max_seq_length,
-                           tokenizer):
-    """Converts a single `InputExample` into a single `InputFeatures`."""
-    label_map = {}
-    for (i, label) in enumerate(label_list):
-        label_map[label] = i
-    tokens_a = tokenizer.tokenize(example.text_a)
-    tokens_b = None
-    if example.text_b:
-        tokens_b = tokenizer.tokenize(example.text_b)
-    if tokens_b:
-        # Modifies `tokens_a` and `tokens_b` in place so that the total
-        # length is less than the specified length.
-        # Account for [CLS], [SEP], [SEP] with "- 3"
-        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
-    else:
-        # Account for [CLS] and [SEP] with "- 2"
-        if len(tokens_a) > max_seq_length - 2:
-            tokens_a = tokens_a[0:(max_seq_length - 2)]
-    # The convention in BERT is:
-    # (a) For sequence pairs:
-    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
-    # (b) For single sequences:
-    #  tokens:   [CLS] the dog is hairy . [SEP]
-    #  type_ids: 0     0   0   0  0     0 0
-    #
-    # Where "type_ids" are used to indicate whether this is the first
-    # sequence or the second sequence. The embedding vectors for `type=0` and
-    # `type=1` were learned during pre-training and are added to the wordpiece
-    # embedding vector (and position vector). This is not *strictly* necessary
-    # since the [SEP] token unambiguously separates the sequences, but it makes
-    # it easier for the model to learn the concept of sequences.
-    #
-    # For classification tasks, the first vector (corresponding to [CLS]) is
-    # used as as the "sentence vector". Note that this only makes sense because
-    # the entire model is fine-tuned.
-    tokens = []
-    segment_ids = []
-    tokens.append("[CLS]")
-    segment_ids.append(0)
-    for token in tokens_a:
-        tokens.append(token)
-        segment_ids.append(0)
-    tokens.append("[SEP]")
-    segment_ids.append(0)
-    if tokens_b:
-        for token in tokens_b:
-            tokens.append(token)
-            segment_ids.append(1)
-        tokens.append("[SEP]")
-        segment_ids.append(1)
-    input_ids = tokenizer.convert_tokens_to_ids(tokens)
-    # The mask has 1 for real tokens and 0 for padding tokens. Only real
-    # tokens are attended to.
-    input_mask = [1] * len(input_ids)
-    label_id = label_map[example.label]
-    feature = InputFeatures(
-        input_ids=input_ids,
-        input_mask=input_mask,
-        segment_ids=segment_ids,
-        label_id=label_id)
-    return feature
-def convert_examples_to_features(examples, label_list, max_seq_length,
-                                 tokenizer):
-    """Convert a set of `InputExample`s to a list of `InputFeatures`."""
-    features = []
-    for (ex_index, example) in enumerate(examples):
-        if ex_index % 10000 == 0:
-            print("Writing example %d of %d" % (ex_index, len(examples)))
-        feature = convert_single_example(ex_index, example, label_list,
-                                         max_seq_length, tokenizer)
-        features.append(feature)
-    return features
-if __name__ == '__main__':
-    pass
--- a/reader/joint_reader.py
+++ b/reader/joint_reader.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# -*- coding: utf-8 -*-
-import os
-import sys
-import random
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-from utils.placeholder import Placeholder
-def repeat(reader):
-    """Repeat a generator forever"""
-    generator = reader()
-    while True:
-        try:
-            yield next(generator)
-        except StopIteration:
-            generator = reader()
-            yield next(generator)
-def create_joint_generator(input_shape, generators, task_map_id, is_multi_task=True):
-    def empty_output(input_shape, batch_size=1):
-        results = []
-        for i in range(len(input_shape)):
-            if input_shape[i][1] == 'int32':
-                dtype = np.int32
-            if input_shape[i][1] == 'int64':
-                dtype = np.int64
-            if input_shape[i][1] == 'float32':
-                dtype = np.float32
-            if input_shape[i][1] == 'float64':
-                dtype = np.float64
-            shape = input_shape[i][0]
-            shape[0] = batch_size
-            pad_tensor = np.zeros(shape=shape, dtype=dtype)
-            results.append(pad_tensor)
-        return results
-    def wrapper(): 
-        generators_inst = [repeat(gen[0]) for gen in generators]
-        generators_ratio = [gen[1] for gen in generators]
-        weights = [ratio/sum(generators_ratio) for ratio in generators_ratio]
-        task_names = [gen[2] for gen in generators]
-        task_names_ids = [0]
-        for i in range(1, len(task_names)):  
-            if task_names[i] == task_names[i - 1]: 
-                task_names_ids.append(task_names_ids[-1])
-            else: 
-                task_names_ids.append(task_names_ids[-1] + 1)
-        run_task_id = range(len(generators))
-        while True:
-            idx = np.random.choice(run_task_id, p=weights)
-            gen_results = next(generators_inst[idx])
-            if not gen_results:
-                break
-            batch_size = gen_results[0].shape[0]
-            results = empty_output(input_shape, batch_size)
-            task_id_tensor = np.array([[task_names_ids[idx]]]).astype("int64")
-            results[0] = task_id_tensor
-            backbone_range_start = task_map_id[0][0]
-            backbone_range_end = task_map_id[0][1]
-            for i in range(backbone_range_start, backbone_range_end):
-                results[i] = gen_results[i - 1]
-            cur_gene_task = task_names_ids[idx] + 1
-            for j in range(task_map_id[cur_gene_task][0], task_map_id[cur_gene_task][1]): 
-                results[j] = gen_results[i]
-                i += 1
-            yield results
-    return wrapper
-def create_reader(reader_name, input_shape, is_multi_task, task_map_id, *gens):
-    """
-    build reader for multi_task_learning
-    """
-    placeholder = Placeholder(input_shape)
-    pyreader, model_inputs = placeholder.build(capacity=16, reader_name=reader_name)
-    joint_generator = create_joint_generator(input_shape, gens[0], task_map_id, is_multi_task=is_multi_task)
-    return joint_generator, pyreader, model_inputs
-def joint_input_shape(input_shape_list): 
-    """
-    joint main task and auxiliary tasks input shape
-    """
-    joint_test_input_shape = input_shape_list[0][1]["backbone"] + input_shape_list[0][1]["task"]
-    joint_train_input_shape = [([1, 1], 'int64')] # task_id_shape
-    backbone_input_shape = input_shape_list[0][0]["backbone"]
-    joint_train_input_shape.extend(backbone_input_shape)
-    task_map_id = [(1, len(input_shape_list[0][0]["backbone"]) + 1)]
-    for input_shape in input_shape_list: 
-        task_input_shape = input_shape[0]["task"]
-        joint_train_input_shape.extend(task_input_shape)
-        task_map_id.append((task_map_id[-1][1], task_map_id[-1][1] + len(task_input_shape)))
-    return joint_train_input_shape, joint_test_input_shape, task_map_id
--- a/reader/mask_language_model_reader.py
+++ b/reader/mask_language_model_reader.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-from __future__ import division
-import os
-import numpy as np
-import types
-import gzip
-import logging
-import re
-import six
-import io
-import collections
-from utils import tokenization
-from utils.batching import prepare_batch_data
-def get_input_shape(args):
-    """
-    define mask language model input shape
-    """
-    train_input_shape = {"backbone": [([-1, args.max_seq_len, 1], 'int64'),
-                                      ([-1, args.max_seq_len, 1], 'int64'),
-                                      ([-1, args.max_seq_len, 1], 'int64'),
-                                      ([-1, args.max_seq_len, 1], 'float32')],
-                         "task": [([-1, 1], 'int64'),
-                                  ([-1, 1], 'int64')]
-                         }
-    test_input_shape = {"backbone": [([-1, args.max_seq_len, 1], 'int64'),
-                                     ([-1, args.max_seq_len, 1], 'int64'),
-                                     ([-1, args.max_seq_len, 1], 'int64'),
-                                     ([-1, args.max_seq_len, 1], 'float32')],
-                         "task": [([-1, 1], 'int64'),
-                                  ([-1, 1], 'int64')]
-                         }
-    return train_input_shape, test_input_shape
-class DataProcessor(object): 
-    def __init__(self, args): 
-        self.vocab = self.load_vocab(args.vocab_path)
-        self.data_dir = args.train_file
-        self.batch_size = args.batch_size
-        self.in_tokens = args.in_tokens
-        self.epoch = args.epoch
-        self.current_epoch = 0
-        self.current_file_index = 0
-        self.total_file = 0
-        self.current_file = None
-        self.generate_neg_sample = args.generate_neg_sample
-        self.voc_size = len(self.vocab)
-        self.max_seq_len = args.max_seq_len
-        self.pad_id = self.vocab["[PAD]"]
-        self.cls_id = self.vocab["[CLS]"]
-        self.sep_id = self.vocab["[SEP]"]
-        self.mask_id = self.vocab["[MASK]"]
-        if self.in_tokens:
-            assert self.batch_size >= self.max_seq_len, "The number of " \
-                   "tokens in batch should not be smaller than max seq length."
-    def get_progress(self):
-        """return current progress of training data
-        """
-        return self.current_epoch, self.current_file_index, self.total_file, self.current_file
-    def parse_line(self, line, max_seq_len=512):
-        """ parse one line to token_ids, sentence_ids, pos_ids, label
-        """
-        line = line.strip().decode().split(";")
-        assert len(line) == 4, "One sample must have 4 fields!"
-        (token_ids, sent_ids, pos_ids, label) = line
-        token_ids = [int(token) for token in token_ids.split(" ")]
-        sent_ids = [int(token) for token in sent_ids.split(" ")]
-        pos_ids = [int(token) for token in pos_ids.split(" ")]
-        assert len(token_ids) == len(sent_ids) == len(
-            pos_ids
-        ), "[Must be true]len(token_ids) == len(sent_ids) == len(pos_ids)"
-        label = int(label)
-        if len(token_ids) > max_seq_len:
-            return None
-        return [token_ids, sent_ids, pos_ids, label]
-    def read_file(self, file):
-        assert file.endswith('.gz'), "[ERROR] %s is not a gzip file" % file
-        file_path = self.data_dir + "/" + file
-        with gzip.open(file_path, "rb") as f:
-            for line in f:
-                parsed_line = self.parse_line(
-                    line, max_seq_len=self.max_seq_len)
-                if parsed_line is None:
-                    continue
-                yield parsed_line
-    def convert_to_unicode(self, text):
-        """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
-        if six.PY3:
-            if isinstance(text, str):
-                return text
-            elif isinstance(text, bytes):
-                return text.decode("utf-8", "ignore")
-            else:
-                raise ValueError("Unsupported string type: %s" % (type(text)))
-        elif six.PY2:
-            if isinstance(text, str):
-                return text.decode("utf-8", "ignore")
-            elif isinstance(text, unicode):
-                return text
-            else:
-                raise ValueError("Unsupported string type: %s" % (type(text)))
-        else:
-            raise ValueError("Not running on Python2 or Python 3?")
-    def load_vocab(self, vocab_file):
-        """Loads a vocabulary file into a dictionary."""
-        vocab = collections.OrderedDict()
-        fin = io.open(vocab_file, encoding='utf8')
-        for num, line in enumerate(fin):
-            items = self.convert_to_unicode(line.strip()).split("\t")
-            if len(items) > 2:
-                break
-            token = items[0]
-            index = items[1] if len(items) == 2 else num
-            token = token.strip()
-            vocab[token] = int(index)
-        return vocab
-    def random_pair_neg_samples(self, pos_samples):
-        """ randomly generate negative samples using pos_samples
-            Args:
-                pos_samples: list of positive samples
-            Returns:
-                neg_samples: list of negative samples
-        """
-        np.random.shuffle(pos_samples)
-        num_sample = len(pos_samples)
-        neg_samples = []
-        miss_num = 0
-        for i in range(num_sample):
-            pair_index = (i + 1) % num_sample
-            origin_src_ids = pos_samples[i][0]
-            origin_sep_index = origin_src_ids.index(2)
-            pair_src_ids = pos_samples[pair_index][0]
-            pair_sep_index = pair_src_ids.index(2)
-            src_ids = origin_src_ids[:origin_sep_index + 1] + pair_src_ids[
-                pair_sep_index + 1:]
-            if len(src_ids) >= self.max_seq_len:
-                miss_num += 1
-                continue
-            sent_ids = [0] * len(origin_src_ids[:origin_sep_index + 1]) + [
-                1
-            ] * len(pair_src_ids[pair_sep_index + 1:])
-            pos_ids = list(range(len(src_ids)))
-            neg_sample = [src_ids, sent_ids, pos_ids, 0]
-            assert len(src_ids) == len(sent_ids) == len(
-                pos_ids
-            ), "[ERROR]len(src_id) == lne(sent_id) == len(pos_id) must be True"
-            neg_samples.append(neg_sample)
-        return neg_samples, miss_num
-    def mixin_negative_samples(self, pos_sample_generator, buffer=1000):
-        """ 1. generate negative samples by randomly group sentence_1 and sentence_2 of positive samples
-            2. combine negative samples and positive samples
-            Args:
-                pos_sample_generator: a generator producing a parsed positive sample, which is a list: [token_ids, sent_ids, pos_ids, 1]
-            Returns:
-                sample: one sample from shuffled positive samples and negative samples
-        """
-        pos_samples = []
-        num_total_miss = 0
-        pos_sample_num = 0
-        try:
-            while True:
-                while len(pos_samples) < buffer:
-                    pos_sample = next(pos_sample_generator)
-                    label = pos_sample[3]
-                    assert label == 1, "positive sample's label must be 1"
-                    pos_samples.append(pos_sample)
-                    pos_sample_num += 1
-                neg_samples, miss_num = self.random_pair_neg_samples(
-                    pos_samples)
-                num_total_miss += miss_num
-                samples = pos_samples + neg_samples
-                pos_samples = []
-                np.random.shuffle(samples)
-                for sample in samples:
-                    yield sample
-        except StopIteration:
-            print("stopiteration: reach end of file")
-            if len(pos_samples) == 1:
-                yield pos_samples[0]
-            elif len(pos_samples) == 0:
-                yield None
-            else:
-                neg_samples, miss_num = self.random_pair_neg_samples(
-                    pos_samples)
-                num_total_miss += miss_num
-                samples = pos_samples + neg_samples
-                pos_samples = []
-                np.random.shuffle(samples)
-                for sample in samples:
-                    yield sample
-            print("miss_num:%d\tideal_total_sample_num:%d\tmiss_rate:%f" %
-                  (num_total_miss, pos_sample_num * 2,
-                   num_total_miss / (pos_sample_num * 2)))
-    def data_generator(self, phase='train', shuffle=True, dev_count=1):
-        """
-        data_generator
-        """
-        files = os.listdir(self.data_dir)
-        self.total_file = len(files)
-        assert self.total_file > 0, "[Error] data_dir is empty"
-        def wrapper():
-            def reader(): 
-                if phase == "train": 
-                    epoch_num = self.epoch
-                    is_test = False
-                else: 
-                    epoch_num = 1
-                    is_test = True
-                for epoch in range(epoch_num):
-                    self.current_epoch = epoch + 1
-                    if shuffle:
-                        np.random.shuffle(files)
-                    for index, file in enumerate(files):
-                        self.current_file_index = index + 1
-                        self.current_file = file
-                        sample_generator = self.read_file(file)
-                        if not is_test and self.generate_neg_sample:
-                            sample_generator = self.mixin_negative_samples(
-                                sample_generator)
-                        for sample in sample_generator:
-                            if sample is None:
-                                continue
-                            yield sample
-            def batch_reader(reader, batch_size, in_tokens):
-                batch, total_token_num, max_len = [], 0, 0
-                for parsed_line in reader():
-                    token_ids, sent_ids, pos_ids, label = parsed_line
-                    max_len = max(max_len, len(token_ids))
-                    if in_tokens:
-                        to_append = (len(batch) + 1) * max_len <= batch_size
-                    else:
-                        to_append = len(batch) < batch_size
-                    if to_append:
-                        batch.append(parsed_line)
-                        total_token_num += len(token_ids)
-                    else:
-                        yield batch, total_token_num
-                        batch, total_token_num, max_len = [parsed_line], len(
-                            token_ids), len(token_ids)
-                if len(batch) > 0:
-                    yield batch, total_token_num
-            for batch_data, total_token_num in batch_reader(
-                    reader, self.batch_size, self.in_tokens):
-                yield prepare_batch_data(
-                    batch_data,
-                    total_token_num,
-                    voc_size=self.voc_size,
-                    pad_id=self.pad_id,
-                    cls_id=self.cls_id,
-                    sep_id=self.sep_id,
-                    mask_id=self.mask_id,
-                    max_len=self.max_seq_len,
-                    return_input_mask=True,
-                    return_max_len=False,
-                    return_num_token=False)
-        return wrapper
-if __name__ == "__main__":
-    pass
--- a/reader/reading_comprehension_reader.py
+++ b/reader/reading_comprehension_reader.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Run MRQA"""
-import six
-import io
-import math
-import json
-import random
-import io
-import collections
-import numpy as np
-from utils import tokenization
-from utils.batching import prepare_batch_data
-def get_input_shape(args): 
-    """
-    define mrqa input shape
-    """
-    train_input_shape = {"backbone": [([-1, args.max_seq_len, 1], 'int64'), 
-                                      ([-1, args.max_seq_len, 1], 'int64'),
-                                      ([-1, args.max_seq_len, 1], 'int64'),
-                                      ([-1, args.max_seq_len, 1], 'float32')], 
-                         "task": [([-1, 1], 'int64'),
-                                 ([-1, 1], 'int64')]
-                         }
-    test_input_shape = {"backbone": [([-1, args.max_seq_len, 1], 'int64'), 
-                                    ([-1, args.max_seq_len, 1], 'int64'),
-                                    ([-1, args.max_seq_len, 1], 'int64'),
-                                    ([-1, args.max_seq_len, 1], 'float32')], 
-                        "task": [([-1, 1], 'int64')]
-                        }
-    return train_input_shape, test_input_shape
-class DataProcessor(object): 
-    def __init__(self, args): 
-        self._tokenizer = tokenization.FullTokenizer(
-            vocab_file=args.vocab_path, do_lower_case=args.do_lower_case)
-        self._max_seq_length = args.max_seq_len
-        self._doc_stride = args.doc_stride
-        self._max_query_length = args.max_query_length
-        self._in_tokens = args.in_tokens
-        self._train_file = args.train_file
-        self._predict_file = args.predict_file
-        self._batch_size = args.batch_size
-        self._with_negative = args.with_negative
-        self._epoch = args.epoch
-        self._sample_rate = args.sample_rate
-        self.vocab = self._tokenizer.vocab
-        self.vocab_size = len(self.vocab)
-        self.pad_id = self.vocab["[PAD]"]
-        self.cls_id = self.vocab["[CLS]"]
-        self.sep_id = self.vocab["[SEP]"]
-        self.mask_id = self.vocab["[MASK]"]
-        self.current_train_example = -1
-        self.num_train_examples = -1
-        self.current_train_epoch = -1
-        self.train_examples = None
-        self.predict_examples = None
-        self.predict_features = None
-        self.num_examples = {'train': -1, 'predict': -1}
-    def get_train_progress(self):
-        """Gets progress for training phase."""
-        return self.current_train_example, self.current_train_epoch
-    def get_examples(self,
-                     data_path,
-                     is_training,
-                     with_negative=False):
-        examples = read_mrqa_examples(
-            input_file=data_path,
-            is_training=is_training,
-            with_negative=with_negative)
-        return examples
-    def get_num_examples(self):
-        """Noted that this API Only support for Training phase."""
-        return estimate_runtime_examples(self._train_file, self._sample_rate, self._tokenizer, \
-                                  self._max_seq_length, self._doc_stride, self._max_query_length, \
-                                  remove_impossible_questions=True, filter_invalid_spans=True)
-    def get_features(self, examples, is_training, n_print=0):
-        features = convert_examples_to_features(
-            examples=examples,
-            tokenizer=self._tokenizer,
-            max_seq_length=self._max_seq_length,
-            doc_stride=self._doc_stride,
-            max_query_length=self._max_query_length,
-            is_training=is_training,
-            n_print=n_print)
-        return features
-    def data_generator(self,
-                       phase='train',
-                       shuffle=False,
-                       dev_count=1): 
-        if phase == 'train':
-            self.train_examples = self.get_examples(
-                self._train_file,
-                is_training=True,
-                with_negative=self._with_negative)
-            examples = self.train_examples
-            self.num_examples['train'] = len(self.train_examples)
-        elif phase == 'predict':
-            self.predict_examples = self.get_examples(
-                self._predict_file,
-                is_training=False,
-                with_negative=self._with_negative)
-            examples = self.predict_examples
-            self.num_examples['predict'] = len(self.predict_examples)
-        else:
-            raise ValueError(
-                "Unknown phase, which should be in ['train', 'predict'].")
-        def batch_reader(features, batch_size, in_tokens):
-            batch, total_token_num, max_len = [], 0, 0
-            for (index, feature) in enumerate(features):
-                if phase == 'train':
-                    self.current_train_example = index + 1
-                seq_len = len(feature.input_ids)
-                labels = [feature.unique_id
-                          ] if feature.start_position is None else [
-                              feature.start_position, feature.end_position
-                          ]
-                example = [
-                    feature.input_ids, feature.segment_ids, range(seq_len)
-                ] + labels
-                max_len = max(max_len, seq_len)
-                if in_tokens:
-                    to_append = (len(batch) + 1) * max_len <= batch_size
-                else:
-                    to_append = len(batch) < batch_size
-                if to_append:
-                    batch.append(example)
-                    total_token_num += seq_len
-                else:
-                    yield batch, total_token_num
-                    batch, total_token_num, max_len = [example
-                                                       ], seq_len, seq_len
-            if len(batch) > 0:
-                yield batch, total_token_num
-        def wrapper(): 
-            if phase == "train": 
-                epoch = self._epoch
-            else: 
-                epoch = 1
-            for epoch_index in range(epoch):
-                if epoch_index == 0:
-                    n_print = 2
-                else:
-                    n_print = 0
-                if shuffle:
-                    random.shuffle(examples)
-                if phase == 'train':
-                    self.current_train_epoch = epoch_index
-                    features = self.get_features(examples, is_training=True, n_print=n_print)
-                else:
-                    features = self.get_features(examples, is_training=False, n_print=n_print)
-                    # CAUSIOUS! cannot be repeated called, 'cause it's a generator!
-                all_dev_batches = []
-                for batch_data, total_token_num in batch_reader(
-                        features, self._batch_size, self._in_tokens):
-                    batch_data = prepare_batch_data(
-                        batch_data,
-                        total_token_num,
-                        max_len=self._max_seq_length,
-                        voc_size=-1,
-                        pad_id=self.pad_id,
-                        cls_id=self.cls_id,
-                        sep_id=self.sep_id,
-                        mask_id=-1,
-                        return_input_mask=True,
-                        return_max_len=False,
-                        return_num_token=False)
-                    if len(all_dev_batches) < dev_count:
-                        all_dev_batches.append(batch_data)
-                    if len(all_dev_batches) == dev_count:
-                        for batch in all_dev_batches:
-                            yield batch
-                        all_dev_batches = []
-                if phase == 'predict' and len(all_dev_batches) > 0:
-                    fake_batch = all_dev_batches[-1]
-                    fake_batch = fake_batch[:-1] + [np.array([-1]*len(fake_batch[0]))]
-                    all_dev_batches = all_dev_batches + [fake_batch] * (dev_count - len(all_dev_batches))
-                    for batch in all_dev_batches:
-                        yield batch
-        return wrapper
-    def write_predictions(self, all_results, n_best_size,
-                          max_answer_length, do_lower_case, output_prediction_file,
-                          output_nbest_file, output_null_log_odds_file,
-                          with_negative, null_score_diff_threshold,
-                          verbose):
-        """Write final predictions to the json file and log-odds of null if needed."""
-        print("Writing predictions to: %s" % (output_prediction_file))
-        print("Writing nbest to: %s" % (output_nbest_file))
-        all_examples = self.predict_examples
-        all_features = self.get_features(all_examples, is_training=False, n_print=0)
-        example_index_to_features = collections.defaultdict(list)
-        for feature in all_features:
-            example_index_to_features[feature.example_index].append(feature)
-        unique_id_to_result = {}
-        for result in all_results:
-            unique_id_to_result[result.unique_id] = result
-        _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-            "PrelimPrediction", [
-                "feature_index", "start_index", "end_index", "start_logit",
-                "end_logit"
-            ])
-        all_predictions = collections.OrderedDict()
-        all_nbest_json = collections.OrderedDict()
-        scores_diff_json = collections.OrderedDict()
-        for (example_index, example) in enumerate(all_examples):
-            features = example_index_to_features[example_index]
-            prelim_predictions = []
-            # keep track of the minimum score of null start+end of position 0
-            score_null = 1000000  # large and positive
-            min_null_feature_index = 0  # the paragraph slice with min mull score
-            null_start_logit = 0  # the start logit at the slice with min null score
-            null_end_logit = 0  # the end logit at the slice with min null score
-            for (feature_index, feature) in enumerate(features):
-                result = unique_id_to_result[feature.unique_id]
-                start_indexes = _get_best_indexes(result.start_logits, n_best_size)
-                end_indexes = _get_best_indexes(result.end_logits, n_best_size)
-                # if we could have irrelevant answers, get the min score of irrelevant
-                if with_negative:
-                    feature_null_score = result.start_logits[0] + result.end_logits[
-                        0]
-                    if feature_null_score < score_null:
-                        score_null = feature_null_score
-                        min_null_feature_index = feature_index
-                        null_start_logit = result.start_logits[0]
-                        null_end_logit = result.end_logits[0]
-                for start_index in start_indexes:
-                    for end_index in end_indexes:
-                        # We could hypothetically create invalid predictions, e.g., predict
-                        # that the start of the span is in the question. We throw out all
-                        # invalid predictions.
-                        if start_index >= len(feature.tokens):
-                            continue
-                        if end_index >= len(feature.tokens):
-                            continue
-                        if start_index not in feature.token_to_orig_map:
-                            continue
-                        if end_index not in feature.token_to_orig_map:
-                            continue
-                        if not feature.token_is_max_context.get(start_index, False):
-                            continue
-                        if end_index < start_index:
-                            continue
-                        length = end_index - start_index + 1
-                        if length > max_answer_length:
-                            continue
-                        prelim_predictions.append(
-                            _PrelimPrediction(
-                                feature_index=feature_index,
-                                start_index=start_index,
-                                end_index=end_index,
-                                start_logit=result.start_logits[start_index],
-                                end_logit=result.end_logits[end_index]))
-            if with_negative:
-                prelim_predictions.append(
-                    _PrelimPrediction(
-                        feature_index=min_null_feature_index,
-                        start_index=0,
-                        end_index=0,
-                        start_logit=null_start_logit,
-                        end_logit=null_end_logit))
-            prelim_predictions = sorted(
-                prelim_predictions,
-                key=lambda x: (x.start_logit + x.end_logit),
-                reverse=True)
-            _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-                "NbestPrediction", ["text", "start_logit", "end_logit"])
-            seen_predictions = {}
-            nbest = []
-            for pred in prelim_predictions:
-                if len(nbest) >= n_best_size:
-                    break
-                feature = features[pred.feature_index]
-                if pred.start_index > 0:  # this is a non-null prediction
-                    tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1
-                                                                  )]
-                    orig_doc_start = feature.token_to_orig_map[pred.start_index]
-                    orig_doc_end = feature.token_to_orig_map[pred.end_index]
-                    orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
-                                                                     1)]
-                    tok_text = " ".join(tok_tokens)
-                    # De-tokenize WordPieces that have been split off.
-                    tok_text = tok_text.replace(" ##", "")
-                    tok_text = tok_text.replace("##", "")
-                    # Clean whitespace
-                    tok_text = tok_text.strip()
-                    tok_text = " ".join(tok_text.split())
-                    orig_text = " ".join(orig_tokens)
-                    final_text = get_final_text(tok_text, orig_text, do_lower_case,
-                                                verbose)
-                    if final_text in seen_predictions:
-                        continue
-                    seen_predictions[final_text] = True
-                else:
-                    final_text = ""
-                    seen_predictions[final_text] = True
-                nbest.append(
-                    _NbestPrediction(
-                        text=final_text,
-                        start_logit=pred.start_logit,
-                        end_logit=pred.end_logit))
-            # if we didn't inlude the empty option in the n-best, inlcude it
-            if with_negative:
-                if "" not in seen_predictions:
-                    nbest.append(
-                        _NbestPrediction(
-                            text="",
-                            start_logit=null_start_logit,
-                            end_logit=null_end_logit))
-            # In very rare edge cases we could have no valid predictions. So we
-            # just create a nonce prediction in this case to avoid failure.
-            if not nbest:
-                nbest.append(
-                    _NbestPrediction(
-                        text="empty", start_logit=0.0, end_logit=0.0))
-            assert len(nbest) >= 1
-            total_scores = []
-            best_non_null_entry = None
-            for entry in nbest:
-                total_scores.append(entry.start_logit + entry.end_logit)
-                if not best_non_null_entry:
-                    if entry.text:
-                        best_non_null_entry = entry
-            # debug
-            if best_non_null_entry is None:
-                print("Emmm..., sth wrong")
-            probs = _compute_softmax(total_scores)
-            nbest_json = []
-            for (i, entry) in enumerate(nbest):
-                output = collections.OrderedDict()
-                output["text"] = entry.text
-                output["probability"] = probs[i]
-                output["start_logit"] = entry.start_logit
-                output["end_logit"] = entry.end_logit
-                nbest_json.append(output)
-            assert len(nbest_json) >= 1
-            if not with_negative:
-                all_predictions[example.qas_id] = nbest_json[0]["text"]
-            else:
-                # predict "" iff the null score - the score of best non-null > threshold
-                score_diff = score_null - best_non_null_entry.start_logit - (
-                    best_non_null_entry.end_logit)
-                scores_diff_json[example.qas_id] = score_diff
-                if score_diff > null_score_diff_threshold:
-                    all_predictions[example.qas_id] = ""
-                else:
-                    all_predictions[example.qas_id] = best_non_null_entry.text
-            all_nbest_json[example.qas_id] = nbest_json
-        with io.open(output_prediction_file, "w", encoding="utf8") as writer:
-            writer.write(json.dumps(all_predictions, indent=4) + "\n")
-        with io.open(output_nbest_file, "w", encoding="utf8") as writer:
-            writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
-        if with_negative:
-            with io.open(output_null_log_odds_file, "w", encoding="utf8") as writer:
-                writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
-class MRQAExample(object):
-    """A single training/test example for the MRQA question answering task.
-     For examples without an answer, the start and end position are -1.
-  """
-    def __init__(self,
-                 qas_id,
-                 question_text,
-                 doc_tokens,
-                 orig_answer_text=None,
-                 start_position=None,
-                 end_position=None,
-                 is_impossible=False):
-        self.qas_id = qas_id
-        self.question_text = question_text
-        self.doc_tokens = doc_tokens
-        self.orig_answer_text = orig_answer_text
-        self.start_position = start_position
-        self.end_position = end_position
-        self.is_impossible = is_impossible
-    def __str__(self):
-        return self.__repr__()
-    def __repr__(self):
-        s = ""
-        s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
-        s += ", question_text: %s" % (
-            tokenization.printable_text(self.question_text))
-        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
-        if self.start_position:
-            s += ", start_position: %d" % (self.start_position)
-        if self.start_position:
-            s += ", end_position: %d" % (self.end_position)
-        if self.start_position:
-            s += ", is_impossible: %r" % (self.is_impossible)
-        return s
-class InputFeatures(object):
-    """A single set of features of data."""
-    def __init__(self,
-                 unique_id,
-                 example_index,
-                 doc_span_index,
-                 tokens,
-                 token_to_orig_map,
-                 token_is_max_context,
-                 input_ids,
-                 input_mask,
-                 segment_ids,
-                 start_position=None,
-                 end_position=None,
-                 is_impossible=None):
-        self.unique_id = unique_id
-        self.example_index = example_index
-        self.doc_span_index = doc_span_index
-        self.tokens = tokens
-        self.token_to_orig_map = token_to_orig_map
-        self.token_is_max_context = token_is_max_context
-        self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.segment_ids = segment_ids
-        self.start_position = start_position
-        self.end_position = end_position
-        self.is_impossible = is_impossible
-def read_mrqa_examples(input_file, is_training, with_negative=False):
-    """Read a MRQA json file into a list of MRQAExample."""
-    phase = 'training' if is_training else 'testing'
-    print("loading mrqa {} data...".format(phase))
-    with io.open(input_file, "r", encoding="utf8") as reader:
-        input_data = json.load(reader)["data"]
-    def is_whitespace(c):
-        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
-            return True
-        return False
-    examples = []
-    for entry in input_data:
-        for paragraph in entry["paragraphs"]:
-            paragraph_text = paragraph["context"]
-            doc_tokens = []
-            char_to_word_offset = []
-            prev_is_whitespace = True
-            for c in paragraph_text:
-                if is_whitespace(c):
-                    prev_is_whitespace = True
-                else:
-                    if prev_is_whitespace:
-                        doc_tokens.append(c)
-                    else:
-                        doc_tokens[-1] += c
-                    prev_is_whitespace = False
-                char_to_word_offset.append(len(doc_tokens) - 1)
-            for qa in paragraph["qas"]:
-                qas_id = qa["id"]
-                question_text = qa["question"]
-                start_position = None
-                end_position = None
-                orig_answer_text = None
-                is_impossible = False
-                if is_training:
-                    if with_negative:
-                        is_impossible = qa["is_impossible"]
-                    if (len(qa["answers"]) != 1) and (not is_impossible):
-                        raise ValueError(
-                            "For training, each question should have exactly 1 answer."
-                        )
-                    if not is_impossible:
-                        answer = qa["answers"][0]
-                        orig_answer_text = answer["text"]
-                        answer_offset = answer["answer_start"]
-                        answer_length = len(orig_answer_text)
-                        start_position = char_to_word_offset[answer_offset]
-                        end_position = char_to_word_offset[answer_offset +
-                                                           answer_length - 1]
-                        # Only add answers where the text can be exactly recovered from the
-                        # document. If this CAN'T happen it's likely due to weird Unicode
-                        # stuff so we will just skip the example.
-                        #
-                        # Note that this means for training mode, every example is NOT
-                        # guaranteed to be preserved.
-                        actual_text = " ".join(doc_tokens[start_position:(
-                            end_position + 1)])
-                        cleaned_answer_text = " ".join(
-                            tokenization.whitespace_tokenize(orig_answer_text))
-                        if actual_text.find(cleaned_answer_text) == -1:
-                            print("Could not find answer: '%s' vs. '%s'",
-                                  actual_text, cleaned_answer_text)
-                            continue
-                    else:
-                        start_position = -1
-                        end_position = -1
-                        orig_answer_text = ""
-                example = MRQAExample(
-                    qas_id=qas_id,
-                    question_text=question_text,
-                    doc_tokens=doc_tokens,
-                    orig_answer_text=orig_answer_text,
-                    start_position=start_position,
-                    end_position=end_position,
-                    is_impossible=is_impossible)
-                examples.append(example)
-    return examples
-def convert_examples_to_features(
-        examples,
-        tokenizer,
-        max_seq_length,
-        doc_stride,
-        max_query_length,
-        is_training,
-        n_print=0
-):
-    """Generator that converts examples into `InputFeatures` (one per retained document span)."""
-    unique_id = 1000000000
-    for (example_index, example) in enumerate(examples):
-        query_tokens = tokenizer.tokenize(example.question_text)
-        if len(query_tokens) > max_query_length:
-            query_tokens = query_tokens[0:max_query_length]
-        tok_to_orig_index = []
-        orig_to_tok_index = []
-        all_doc_tokens = []
-        for (i, token) in enumerate(example.doc_tokens):
-            orig_to_tok_index.append(len(all_doc_tokens))
-            sub_tokens = tokenizer.tokenize(token)
-            for sub_token in sub_tokens:
-                tok_to_orig_index.append(i)
-                all_doc_tokens.append(sub_token)
-        tok_start_position = None
-        tok_end_position = None
-        if is_training and example.is_impossible:
-            tok_start_position = -1
-            tok_end_position = -1
-        if is_training and not example.is_impossible:
-            tok_start_position = orig_to_tok_index[example.start_position]
-            if example.end_position < len(example.doc_tokens) - 1:
-                tok_end_position = orig_to_tok_index[example.end_position +
-                                                     1] - 1
-            else:
-                tok_end_position = len(all_doc_tokens) - 1
-            (tok_start_position, tok_end_position) = _improve_answer_span(
-                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
-                example.orig_answer_text)
-        # The -3 accounts for [CLS], [SEP] and [SEP]
-        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
-        # We can have documents that are longer than the maximum sequence length.
-        # To deal with this we do a sliding window approach, where we take chunks
-        # of up to our max length with a stride of `doc_stride`.
-        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
-            "DocSpan", ["start", "length"])
-        doc_spans = []
-        start_offset = 0
-        while start_offset < len(all_doc_tokens):
-            length = len(all_doc_tokens) - start_offset
-            if length > max_tokens_for_doc:
-                length = max_tokens_for_doc
-            doc_spans.append(_DocSpan(start=start_offset, length=length))
-            if start_offset + length == len(all_doc_tokens):
-                break
-            start_offset += min(length, doc_stride)
-        for (doc_span_index, doc_span) in enumerate(doc_spans):
-            tokens = []
-            token_to_orig_map = {}
-            token_is_max_context = {}
-            segment_ids = []
-            tokens.append("[CLS]")
-            segment_ids.append(0)
-            for token in query_tokens:
-                tokens.append(token)
-                segment_ids.append(0)
-            tokens.append("[SEP]")
-            segment_ids.append(0)
-            for i in range(doc_span.length):
-                split_token_index = doc_span.start + i
-                token_to_orig_map[len(tokens)] = tok_to_orig_index[
-                    split_token_index]
-                is_max_context = _check_is_max_context(
-                    doc_spans, doc_span_index, split_token_index)
-                token_is_max_context[len(tokens)] = is_max_context
-                tokens.append(all_doc_tokens[split_token_index])
-                segment_ids.append(1)
-            tokens.append("[SEP]")
-            segment_ids.append(1)
-            input_ids = tokenizer.convert_tokens_to_ids(tokens)
-            # The mask has 1 for real tokens and 0 for padding tokens. Only real
-            # tokens are attended to.
-            input_mask = [1] * len(input_ids)
-            start_position = None
-            end_position = None
-            if is_training and not example.is_impossible:
-                # For training, if our document chunk does not contain an annotation
-                # we throw it out, since there is nothing to predict.
-                doc_start = doc_span.start
-                doc_end = doc_span.start + doc_span.length - 1
-                out_of_span = False
-                if not (tok_start_position >= doc_start and
-                        tok_end_position <= doc_end):
-                    out_of_span = True
-                if out_of_span:
-                    start_position = 0
-                    end_position = 0
-                    continue
-                else:
-                    doc_offset = len(query_tokens) + 2
-                    start_position = tok_start_position - doc_start + doc_offset
-                    end_position = tok_end_position - doc_start + doc_offset
-            if is_training and example.is_impossible:
-                start_position = 0
-                end_position = 0
-            if n_print > 0:
-                n_print -= 1
-                print("*** Example ***")
-                print("unique_id: %s" % (unique_id))
-                print("example_index: %s" % (example_index))
-                print("doc_span_index: %s" % (doc_span_index))
-                print("tokens: %s" % " ".join(
-                    [tokenization.printable_text(x) for x in tokens]))
-                print("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-                print("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-                print("segment_ids: %s" %
-                      " ".join([str(x) for x in segment_ids]))
-                if is_training and example.is_impossible:
-                    print("impossible example")
-                if is_training and not example.is_impossible:
-                    answer_text = " ".join(tokens[start_position:(end_position +
-                                                                  1)])
-                    print("start_position: %d" % (start_position))
-                    print("end_position: %d" % (end_position))
-                    print("answer: %s" %
-                          (tokenization.printable_text(answer_text)))
-            feature = InputFeatures(
-                unique_id=unique_id,
-                example_index=example_index,
-                doc_span_index=doc_span_index,
-                tokens=tokens,
-                token_to_orig_map=token_to_orig_map,
-                token_is_max_context=token_is_max_context,
-                input_ids=input_ids,
-                input_mask=input_mask,
-                segment_ids=segment_ids,
-                start_position=start_position,
-                end_position=end_position,
-                is_impossible=example.is_impossible)
-            unique_id += 1
-            yield feature
-def estimate_runtime_examples(data_path, sample_rate, tokenizer, \
-                              max_seq_length, doc_stride, max_query_length, \
-                              remove_impossible_questions=True, filter_invalid_spans=True):
-    """Count runtime examples, which may differ from the number of raw samples due to the sliding window operation, etc. This is useful to get correct warmup steps for training."""
-    assert sample_rate > 0.0 and sample_rate <= 1.0, "sample_rate must be set between 0.0~1.0"
-    print("loading data with json parser...")
-    with io.open(data_path, "r", encoding="utf8") as reader:
-        data = json.load(reader)["data"]
-    num_raw_examples = 0
-    for entry in data:
-        for paragraph in entry["paragraphs"]:
-            paragraph_text = paragraph["context"]
-            for qa in paragraph["qas"]:
-                num_raw_examples += 1
-    print("num raw examples:{}".format(num_raw_examples))
-    def is_whitespace(c):
-        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
-            return True
-        return False
-    sampled_examples = []
-    for entry in data:
-        for paragraph in entry["paragraphs"]:
-            doc_tokens = None
-            for qa in paragraph["qas"]:
-                if sampled_examples and random.random() > sample_rate and sample_rate < 1.0:
-                    continue
-                if doc_tokens is None:
-                    paragraph_text = paragraph["context"]
-                    doc_tokens = []
-                    char_to_word_offset = []
-                    prev_is_whitespace = True
-                    for c in paragraph_text:
-                        if is_whitespace(c):
-                            prev_is_whitespace = True
-                        else:
-                            if prev_is_whitespace:
-                                doc_tokens.append(c)
-                            else:
-                                doc_tokens[-1] += c
-                            prev_is_whitespace = False
-                        char_to_word_offset.append(len(doc_tokens) - 1)
-                assert len(qa["answers"]) == 1, "For training, each question should have exactly 1 answer."
-                qas_id = qa["id"]
-                question_text = qa["question"]
-                start_position = None
-                end_position = None
-                orig_answer_text = None
-                is_impossible = False
-                if ('is_impossible' in qa) and (qa["is_impossible"]):
-                    if remove_impossible_questions or filter_invalid_spans:
-                        continue
-                    else:
-                        start_position = -1
-                        end_position = -1
-                        orig_answer_text = ""
-                        is_impossible = True
-                else:
-                    answer = qa["answers"][0]
-                    orig_answer_text = answer["text"]
-                    answer_offset = answer["answer_start"]
-                    answer_length = len(orig_answer_text)
-                    start_position = char_to_word_offset[answer_offset]
-                    end_position = char_to_word_offset[answer_offset +
-                                                       answer_length - 1]
-                    # remove corrupt samples
-                    actual_text = " ".join(doc_tokens[start_position:(
-                        end_position + 1)])
-                    cleaned_answer_text = " ".join(
-                        tokenization.whitespace_tokenize(orig_answer_text))
-                    if actual_text.find(cleaned_answer_text) == -1:
-                        print("Could not find answer: '%s' vs. '%s'",
-                              actual_text, cleaned_answer_text)
-                        continue
-                example = MRQAExample(
-                    qas_id=qas_id,
-                    question_text=question_text,
-                    doc_tokens=doc_tokens,
-                    orig_answer_text=orig_answer_text,
-                    start_position=start_position,
-                    end_position=end_position,
-                    is_impossible=is_impossible)
-                sampled_examples.append(example)
-    runtime_sample_rate = len(sampled_examples) / float(num_raw_examples)
-    runtime_samp_cnt = 0
-    for example in sampled_examples:
-        query_tokens = tokenizer.tokenize(example.question_text)
-        if len(query_tokens) > max_query_length:
-            query_tokens = query_tokens[0:max_query_length]
-        tok_to_orig_index = []
-        orig_to_tok_index = []
-        all_doc_tokens = []
-        for (i, token) in enumerate(example.doc_tokens):
-            orig_to_tok_index.append(len(all_doc_tokens))
-            sub_tokens = tokenizer.tokenize(token)
-            for sub_token in sub_tokens:
-                tok_to_orig_index.append(i)
-                all_doc_tokens.append(sub_token)
-        tok_start_position = None
-        tok_end_position = None
-        tok_start_position = orig_to_tok_index[example.start_position]
-        if example.end_position < len(example.doc_tokens) - 1:
-            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
-        else:
-            tok_end_position = len(all_doc_tokens) - 1
-        (tok_start_position, tok_end_position) = _improve_answer_span(
-            all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
-            example.orig_answer_text)
-        # The -3 accounts for [CLS], [SEP] and [SEP]
-        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
-        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
-            "DocSpan", ["start", "length"])
-        doc_spans = []
-        start_offset = 0
-        while start_offset < len(all_doc_tokens):
-            length = len(all_doc_tokens) - start_offset
-            if length > max_tokens_for_doc:
-                length = max_tokens_for_doc
-            doc_spans.append(_DocSpan(start=start_offset, length=length))
-            if start_offset + length == len(all_doc_tokens):
-                break
-            start_offset += min(length, doc_stride)
-        for (doc_span_index, doc_span) in enumerate(doc_spans):
-            doc_start = doc_span.start
-            doc_end = doc_span.start + doc_span.length - 1
-            if filter_invalid_spans and not (tok_start_position >= doc_start and tok_end_position <= doc_end):
-                continue
-            runtime_samp_cnt += 1
-    return int(runtime_samp_cnt/runtime_sample_rate)
-def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
-                         orig_answer_text):
-    """Returns tokenized answer spans that better match the annotated answer."""
-    # The MRQA annotations are character based. We first project them to
-    # whitespace-tokenized words. But then after WordPiece tokenization, we can
-    # often find a "better match". For example:
-    #
-    #   Question: What year was John Smith born?
-    #   Context: The leader was John Smith (1895-1943).
-    #   Answer: 1895
-    #
-    # The original whitespace-tokenized answer will be "(1895-1943).". However
-    # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
-    # the exact answer, 1895.
-    #
-    # However, this is not always possible. Consider the following:
-    #
-    #   Question: What country is the top exporter of electronics?
-    #   Context: The Japanese electronics industry is the largest in the world.
-    #   Answer: Japan
-    #
-    # In this case, the annotator chose "Japan" as a character sub-span of
-    # the word "Japanese". Since our WordPiece tokenizer does not split
-    # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
-    # in MRQA, but does happen.
-    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
-    for new_start in range(input_start, input_end + 1):
-        for new_end in range(input_end, new_start - 1, -1):
-            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
-            if text_span == tok_answer_text:
-                return (new_start, new_end)
-    return (input_start, input_end)
-def _check_is_max_context(doc_spans, cur_span_index, position):
-    """Check if this is the 'max context' doc span for the token."""
-    # Because of the sliding window approach taken to scoring documents, a single
-    # token can appear in multiple document spans. E.g.
-    #  Doc: the man went to the store and bought a gallon of milk
-    #  Span A: the man went to the
-    #  Span B: to the store and bought
-    #  Span C: and bought a gallon of
-    #  ...
-    #
-    # Now the word 'bought' will have two scores from spans B and C. We only
-    # want to consider the score with "maximum context", which we define as
-    # the *minimum* of its left and right context (the *sum* of left and
-    # right context will always be the same, of course).
-    #
-    # In the example the maximum context for 'bought' would be span C since
-    # it has 1 left context and 3 right context, while span B has 4 left context
-    # and 0 right context.
-    best_score = None
-    best_span_index = None
-    for (span_index, doc_span) in enumerate(doc_spans):
-        end = doc_span.start + doc_span.length - 1
-        if position < doc_span.start:
-            continue
-        if position > end:
-            continue
-        num_left_context = position - doc_span.start
-        num_right_context = end - position
-        score = min(num_left_context,
-                    num_right_context) + 0.01 * doc_span.length
-        if best_score is None or score > best_score:
-            best_score = score
-            best_span_index = span_index
-    return cur_span_index == best_span_index
-def get_final_text(pred_text, orig_text, do_lower_case, verbose):
-    """Project the tokenized prediction back to the original text."""
-    # When we created the data, we kept track of the alignment between original
-    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
-    # now `orig_text` contains the span of our original text corresponding to the
-    # span that we predicted.
-    #
-    # However, `orig_text` may contain extra characters that we don't want in
-    # our prediction.
-    #
-    # For example, let's say:
-    #   pred_text = steve smith
-    #   orig_text = Steve Smith's
-    #
-    # We don't want to return `orig_text` because it contains the extra "'s".
-    #
-    # We don't want to return `pred_text` because it's already been normalized
-    # (the MRQA eval script also does punctuation stripping/lower casing but
-    # our tokenizer does additional normalization like stripping accent
-    # characters).
-    #
-    # What we really want to return is "Steve Smith".
-    #
-    # Therefore, we have to apply a semi-complicated alignment heuristic between
-    # `pred_text` and `orig_text` to get a character-to-character alignment. This
-    # can fail in certain cases in which case we just return `orig_text`.
-    def _strip_spaces(text):
-        ns_chars = []
-        ns_to_s_map = collections.OrderedDict()
-        for (i, c) in enumerate(text):
-            if c == " ":
-                continue
-            ns_to_s_map[len(ns_chars)] = i
-            ns_chars.append(c)
-        ns_text = "".join(ns_chars)
-        return (ns_text, ns_to_s_map)
-    # We first tokenize `orig_text`, strip whitespace from the result
-    # and `pred_text`, and check if they are the same length. If they are
-    # NOT the same length, the heuristic has failed. If they are the same
-    # length, we assume the characters are one-to-one aligned.
-    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
-    tok_text = " ".join(tokenizer.tokenize(orig_text))
-    start_position = tok_text.find(pred_text)
-    if start_position == -1:
-        if verbose:
-            print("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
-        return orig_text
-    end_position = start_position + len(pred_text) - 1
-    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
-    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
-    if len(orig_ns_text) != len(tok_ns_text):
-        if verbose:
-            print("Length not equal after stripping spaces: '%s' vs '%s'",
-                  orig_ns_text, tok_ns_text)
-        return orig_text
-    # We then project the characters in `pred_text` back to `orig_text` using
-    # the character-to-character alignment.
-    tok_s_to_ns_map = {}
-    for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
-        tok_s_to_ns_map[tok_index] = i
-    orig_start_position = None
-    if start_position in tok_s_to_ns_map:
-        ns_start_position = tok_s_to_ns_map[start_position]
-        if ns_start_position in orig_ns_to_s_map:
-            orig_start_position = orig_ns_to_s_map[ns_start_position]
-    if orig_start_position is None:
-        if verbose:
-            print("Couldn't map start position")
-        return orig_text
-    orig_end_position = None
-    if end_position in tok_s_to_ns_map:
-        ns_end_position = tok_s_to_ns_map[end_position]
-        if ns_end_position in orig_ns_to_s_map:
-            orig_end_position = orig_ns_to_s_map[ns_end_position]
-    if orig_end_position is None:
-        if verbose:
-            print("Couldn't map end position")
-        return orig_text
-    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
-    return output_text
-def _get_best_indexes(logits, n_best_size):
-    """Get the indexes of the n-best logits from a list."""
-    index_and_score = sorted(
-        enumerate(logits), key=lambda x: x[1], reverse=True)
-    best_indexes = []
-    for i in range(len(index_and_score)):
-        if i >= n_best_size:
-            break
-        best_indexes.append(index_and_score[i][0])
-    return best_indexes
-def _compute_softmax(scores):
-    """Compute softmax probability over raw logits."""
-    if not scores:
-        return []
-    max_score = None
-    for score in scores:
-        if max_score is None or score > max_score:
-            max_score = score
-    exp_scores = []
-    total_sum = 0.0
-    for score in scores:
-        x = math.exp(score - max_score)
-        exp_scores.append(x)
-        total_sum += x
-    probs = []
-    for score in exp_scores:
-        probs.append(score / total_sum)
-    return probs
-if __name__ == '__main__':
-    train_file = 'data/mrqa-combined.all_dev.raw.json'
-    vocab_file = 'uncased_L-12_H-768_A-12/vocab.txt'
-    do_lower_case = True
-    tokenizer = tokenization.FullTokenizer(
-        vocab_file=vocab_file, do_lower_case=do_lower_case)
-    train_examples = read_mrqa_examples(
-        input_file=train_file, is_training=True)
-    print("begin converting")
-    for (index, feature) in enumerate(
-            convert_examples_to_features(
-                examples=train_examples,
-                tokenizer=tokenizer,
-                max_seq_length=384,
-                doc_stride=128,
-                max_query_length=64,
-                is_training=True,
-            )):
-        if index < 10:
-            print(index, feature.input_ids, feature.input_mask,
-                  feature.segment_ids)
--- a/run.sh
+++ b/run.sh
-#!/bin/bash
-# for gpu memory optimization
-export FLAGS_sync_nccl_allreduce=0
-export FLAGS_eager_delete_tensor_gb=1
-export CUDA_VISIBLE_DEVICES=0
-if [[ ! -d pretrain_model/bert ]]; then
-    bash download_pretrain.sh bert
-fi
-if [[ ! -d pretrain_model/ernie ]]; then
-    bash download_pretrain.sh ernie
-fi
-python -u mtl_run.py
--- a/run_demo.sh
+++ b/run_demo.sh
+export CUDA_VISIBLE_DEVICES=0
+export FLAGS_fraction_of_gpu_memory_to_use=0.1
+export FLAGS_eager_delete_tensor_gb=0
+python demo.py
--- a/script/convert_params.sh
+++ b/script/convert_params.sh
+#!/bin/sh
+if [[ $# != 1 ]]; then
+    echo "usage: bash convert_params.sh <params_dir>"
+    exit 1
+fi
+echo "converting..."
+cd $1
+mkdir .palm.backup
+for file in $(ls *)
+    do cp $file "backbone-"$file; mv $file .palm.backup
+done
+cd - >/dev/null
+echo "done!"
--- a/script/recover_params.sh
+++ b/script/recover_params.sh
+#!/bin/sh
+if [[ $# != 1 ]]; then
+    echo "usage: bash recover_params.sh <params_dir>"
+    exit 1
+fi
+rm $1/backbone-*
+mv $1/.palm.backup/* $1
+rm -rf $1/.palm.backup
--- a/setup.cfg
+++ b/setup.cfg
+[metadata]
+name = paddle-palm
+author = zhangyiming
+author_email = zhangyiming04@baidu.com
+version = 1.2
+description = Paddle-PALM
+long_description = file: README.md
+long_description_content_type = text/markdown
+home_page = https://github.com/PaddlePaddle/PALM
+license = Apache 2.0
+classifier =
+    Private :: Do Not Upload
+    Programming Language :: Python
+    Programming Language :: Python :: 2
+    Programming Language :: Python :: 2.7
+    Programming Language :: Python :: 3
+    Programming Language :: Python :: 3.5
+    Programming Language :: Python :: 3.6
+    Programming Language :: Python :: 3.7
+keywords =
+    paddlepaddle
+    paddle
+    multi-task-learning
+[options]
+packages = find:
+#install_requires =
+#    paddlepaddle-gpu >= 1.5.2
+include_package_data = True
+zip_safe = False
+[sdist]
+dist_dir = output/dist
+[bdist_wheel]
+dist_dir = output/dist
+[easy_install]
+index_url = http://pip.baidu.com/root/baidu/+simple/
--- a/setup.py
+++ b/setup.py
+# -*- coding: UTF-8 -*-
+################################################################################
+#
+#   Copyright (c) 2019  Baidu.com, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+"""
+Setup script.
+Authors: zhouxiangyang(zhouxiangyang@baidu.com)
+Date:    2019/09/29 21:00:01
+"""
+import setuptools
+with open("README.md", "r") as fh:
+    long_description = fh.read()
+setuptools.setup(
+    name="paddle-palm",
+    version="1.2",
+    author="PaddlePaddle",
+    author_email="zhangyiming04@baidu.com",
+    description="A Multi-task Learning Lib for PaddlePaddle Users.",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/PaddlePadd",
+    # packages=setuptools.find_packages(),
+    packages = ['paddlepalm', 
+        'paddlepalm.backbone', 
+        'paddlepalm.backbone.utils', 
+        'paddlepalm.optimizer',
+        'paddlepalm.reader', 
+        'paddlepalm.reader.utils', 
+        'paddlepalm.task_paradigm', 
+        'paddlepalm.tokenizer', 
+        'paddlepalm.utils'],
+    package_dir={'paddlepalm':'./paddlepalm',
+                 'paddlepalm.backbone':'./paddlepalm/backbone',
+                 'paddlepalm.backbone.utils':'./paddlepalm/backbone/utils',
+                 'paddlepalm.optimizer':'./paddlepalm/optimizer',
+                 'paddlepalm.reader':'./paddlepalm/reader',
+                 'paddlepalm.reader.utils':'./paddlepalm/reader/utils',
+                 'paddlepalm.task_paradigm':'./paddlepalm/task_paradigm',
+                 'paddlepalm.tokenizer':'./paddlepalm/tokenizer',
+                 'paddlepalm.utils':'./paddlepalm/utils'},
+    platforms = "any",
+    license='Apache 2.0',
+    classifiers = [
+            'License :: OSI Approved :: Apache Software License',
+            'Programming Language :: Python',
+            'Programming Language :: Python :: 2',
+            'Programming Language :: Python :: 2.7',
+            'Programming Language :: Python :: 3',
+            'Programming Language :: Python :: 3.5',
+            'Programming Language :: Python :: 3.6',
+            'Programming Language :: Python :: 3.7',
+          ],
+)
--- a/task_instance/match4mrqa.yaml
+++ b/task_instance/match4mrqa.yaml
+# Task instance "match4mrqa": match paradigm fed by the match4ernie reader.
+train_file: "data/match4mrqa/train.txt"
+reader: match4ernie
+paradigm: match
--- a/config/mask_language_model.yaml
+++ b/config/mask_language_model.yaml
--- a/task_instance/mrqa.yaml
+++ b/task_instance/mrqa.yaml
+# Task instance "mrqa": mrc paradigm fed by the mrc4ernie reader.
+train_file: data/mrqa/mrqa-combined.train.raw.json
+pred_file: data/mrqa/mrqa-combined.dev.raw.json
+pred_output_path: 'mrqa_output'
+reader: mrc4ernie
+paradigm: mrc
+# NOTE(review): the keys below are presumably read by the mrc4ernie reader
+# and/or the mrc paradigm; their exact meaning is not visible in this file --
+# confirm against those components.
+doc_stride: 128
+max_query_len: 64
+max_answer_len: 30
+n_best_size: 20
+null_score_diff_threshold: 0.0
+verbose: False
--- a/utils/fp16.py
+++ b/utils/fp16.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# -*- coding: utf-8 -*-
-from __future__ import print_function
-import paddle
-import paddle.fluid as fluid
-def cast_fp16_to_fp32(i, o, prog):
-    """Append a ``cast`` op (FP16 -> FP32) to the global block of ``prog``.
-
-    Args:
-        i: Value passed as the op's ``X`` input.
-        o: Value passed as the op's ``Out`` output.
-        prog: Program whose global block receives the op.
-    """
-    prog.global_block().append_op(
-        type="cast",
-        inputs={"X": i},
-        outputs={"Out": o},
-        attrs={
-            "in_dtype": fluid.core.VarDesc.VarType.FP16,
-            "out_dtype": fluid.core.VarDesc.VarType.FP32
-        })
-def cast_fp32_to_fp16(i, o, prog):
-    """Append a ``cast`` op (FP32 -> FP16) to the global block of ``prog``.
-
-    Args:
-        i: Value passed as the op's ``X`` input.
-        o: Value passed as the op's ``Out`` output.
-        prog: Program whose global block receives the op.
-    """
-    prog.global_block().append_op(
-        type="cast",
-        inputs={"X": i},
-        outputs={"Out": o},
-        attrs={
-            "in_dtype": fluid.core.VarDesc.VarType.FP32,
-            "out_dtype": fluid.core.VarDesc.VarType.FP16
-        })
-def copy_to_master_param(p, block):
-    """Create an FP32 ``Parameter`` named ``<name>.master`` that mirrors ``p``.
-
-    Args:
-        p: Parameter to mirror; its name is used for the lookup in ``block``.
-        block: Block that must already hold a variable named ``p.name`` and
-            that the new parameter is created in.
-
-    Returns:
-        The newly created ``fluid.framework.Parameter``.
-
-    Raises:
-        ValueError: If ``block`` has no variable named ``p.name``.
-    """
-    v = block.vars.get(p.name, None)
-    if v is None:
-        raise ValueError("no param name %s found!" % p.name)
-    # Shape, type, lod level and base name come from the block's variable;
-    # the training-related attributes are copied from ``p``.
-    new_p = fluid.framework.Parameter(
-        block=block,
-        shape=v.shape,
-        dtype=fluid.core.VarDesc.VarType.FP32,
-        type=v.type,
-        lod_level=v.lod_level,
-        stop_gradient=p.stop_gradient,
-        trainable=p.trainable,
-        optimize_attr=p.optimize_attr,
-        regularizer=p.regularizer,
-        gradient_clip_attr=p.gradient_clip_attr,
-        error_clip=p.error_clip,
-        name=v.name + ".master")
-    return new_p
-def create_master_params_grads(params_grads, main_prog, startup_prog,
-                               loss_scaling):
-    """Build the (parameter, gradient) list used to apply gradients.
-
-    For every ``(p, g)`` pair a ``.master`` parameter is created in
-    ``main_prog``, cloned into ``startup_prog``, and an FP16 -> FP32 cast op
-    from the startup copy of ``p`` to that clone is appended to
-    ``startup_prog``.
-
-    Pairs whose gradient name contains ``"layer_norm"`` are returned as
-    ``[p, g]`` (original parameter, gradient not cast); all other pairs are
-    returned as ``[master_param, g cast to float32]``. In both cases the
-    gradient is divided by ``loss_scaling`` when ``loss_scaling > 1``.
-
-    Args:
-        params_grads: Iterable of ``(parameter, gradient)`` pairs.
-        main_prog: Main program; its ``_current_role`` is temporarily set to
-            ``OpRole.Backward`` and restored before returning.
-        startup_prog: Startup program that receives the cloned master
-            parameters and the cast ops.
-        loss_scaling: Loss scaling factor; values ``<= 1`` disable unscaling.
-
-    Returns:
-        List of two-element lists ``[parameter, gradient]``.
-    """
-    master_params_grads = []
-    # Remember the current role so it can be restored after the loop.
-    tmp_role = main_prog._current_role
-    OpRole = fluid.core.op_proto_and_checker_maker.OpRole
-    main_prog._current_role = OpRole.Backward
-    for p, g in params_grads:
-        # create master parameters
-        # NOTE(review): this runs before the layer_norm check below, so a
-        # master parameter and a startup cast op are also created for
-        # layer_norm entries even though their result is not used in the
-        # returned list -- confirm whether that is intended.
-        master_param = copy_to_master_param(p, main_prog.global_block())
-        startup_master_param = startup_prog.global_block()._clone_variable(
-            master_param)
-        startup_p = startup_prog.global_block().var(p.name)
-        cast_fp16_to_fp32(startup_p, startup_master_param, startup_prog)
-        # cast fp16 gradients to fp32 before apply gradients
-        if g.name.find("layer_norm") > -1:
-            # layer_norm gradients are only unscaled, never cast, and stay
-            # paired with the original parameter.
-            if loss_scaling > 1:
-                scaled_g = g / float(loss_scaling)
-            else:
-                scaled_g = g
-            master_params_grads.append([p, scaled_g])
-            continue
-        master_grad = fluid.layers.cast(g, "float32")
-        if loss_scaling > 1:
-            master_grad = master_grad / float(loss_scaling)
-        master_params_grads.append([master_param, master_grad])
-    main_prog._current_role = tmp_role
-    return master_params_grads
-def master_param_to_train_param(master_params_grads, params_grads, main_prog):
-    """Append FP32 -> FP16 cast ops from master parameters to train parameters.
-
-    The two lists are matched by index. Entries whose train parameter name
-    contains ``"layer_norm"`` are skipped.
-
-    Args:
-        master_params_grads: List of ``[master_param, master_grad]`` entries.
-        params_grads: List of ``(train_param, grad)`` pairs, index-aligned
-            with ``master_params_grads``.
-        main_prog: Program that receives the cast ops.
-    """
-    for idx, m_p_g in enumerate(master_params_grads):
-        train_p, _ = params_grads[idx]
-        if train_p.name.find("layer_norm") > -1:
-            continue
-        # The cast op is appended inside the program's _optimized_guard for
-        # this (parameter, gradient) pair.
-        with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]):
-            cast_fp32_to_fp16(m_p_g[0], train_p, main_prog)
--- a/utils/placeholder.py
+++ b/utils/placeholder.py
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import os
-import six
-import ast
-import copy
-import numpy as np
-import paddle.fluid as fluid
-class Placeholder(object):
-    """Collect shapes, dtypes, lod levels and names for a ``py_reader``.
-
-    The four lists ``shapes``, ``dtypes``, ``lod_levels`` and ``names`` are
-    kept index-aligned: entry ``k`` of each list describes placeholder ``k``.
-    """
-    # NOTE(review): this no-argument __init__ is replaced by the second
-    # definition below (the later ``def`` wins), so ``Placeholder()`` without
-    # arguments is not accepted.
-    def __init__(self):
-        self.shapes = []
-        self.dtypes = []
-        self.lod_levels = []
-        self.names = []
-    def __init__(self, input_shapes):
-        """Create the holder and register every entry of ``input_shapes``.
-
-        Args:
-            input_shapes: Iterable of sequences laid out as
-                ``(shape, dtype[, lod_level[, name]])``; ``lod_level``
-                defaults to ``0`` and ``name`` to ``""``.
-        """
-        self.shapes = []
-        self.dtypes = []
-        self.lod_levels = []
-        self.names = []
-        for new_holder in input_shapes:
-            shape = new_holder[0]
-            dtype = new_holder[1]
-            lod_level = new_holder[2] if len(new_holder) >= 3 else 0
-            name = new_holder[3] if len(new_holder) >= 4 else ""
-            self.append_placeholder(shape, dtype, lod_level = lod_level, name = name)
-    def append_placeholder(self, shape, dtype, lod_level = 0, name = ""):
-        """Append one placeholder description to the four parallel lists."""
-        self.shapes.append(shape)
-        self.dtypes.append(dtype)
-        self.lod_levels.append(lod_level)
-        self.names.append(name)
-    def build(self, capacity, reader_name, use_double_buffer = False):
-        """Create a ``fluid.layers.py_reader`` from the collected placeholders.
-
-        Args:
-            capacity: Forwarded to ``py_reader`` as ``capacity``.
-            reader_name: Forwarded to ``py_reader`` as ``name``.
-            use_double_buffer: Forwarded to ``py_reader``.
-
-        Returns:
-            A two-element list: the reader and the result of
-            ``fluid.layers.read_file`` on it.
-        """
-        # Note: ``self.names`` is not passed to py_reader.
-        pyreader = fluid.layers.py_reader(
-            capacity = capacity,
-            shapes = self.shapes,
-            dtypes = self.dtypes,
-            lod_levels = self.lod_levels,
-            name = reader_name, 
-            use_double_buffer = use_double_buffer)
-        return [pyreader, fluid.layers.read_file(pyreader)]
-    def __add__(self, new_holder):
-        """Register ``new_holder`` (``(shape, dtype[, lod_level[, name]])``).
-
-        NOTE(review): this mutates ``self`` and has no ``return`` statement,
-        so the expression ``holder + item`` evaluates to ``None`` -- confirm
-        whether callers rely on the result.
-        """
-        assert isinstance(new_holder, tuple) or isinstance(new_holder, list) 
-        assert len(new_holder) >= 2
-        shape = new_holder[0]
-        dtype = new_holder[1]
-        lod_level = new_holder[2] if len(new_holder) >= 3 else 0
-        name = new_holder[3] if len(new_holder) >= 4 else ""
-        self.append_placeholder(shape, dtype, lod_level = lod_level, name = name)
-# Running this module directly only prints a greeting.
-if __name__ == "__main__":
-    print("hello world!")