From ef8879f60cbbdb7dc4c22bc413c63737531b61f2 Mon Sep 17 00:00:00 2001
From: Meiyim <chen_xuyi@outlook.com>
Date: Thu, 21 May 2020 15:20:49 +0800
Subject: [PATCH] Dygraph fix2 (#455)

* remove redundant dep

* + github stale.yml

* update seq2seq

* remove paddle from `requirements.txt`; add paddle version check

* rename zh/en readme
---
 .github/stale.yml                             |  18 ++
 README.eng.md => README.en.md                 |  12 +-
 README.md                                     | 279 +----------------
 README.zh.md                                  | 282 ++++++++++++++++++
 ernie/__init__.py                             |   5 +
 experimental/seq2seq/decode.py                |   7 +-
 .../seq2seq/finetune_seq2seq_dygraph.py       |  15 +-
 experimental/seq2seq/modeling_ernie_gen.py    |   1 +
 requirements.txt                              |   6 +-
 setup.py                                      |   5 +-
 10 files changed, 331 insertions(+), 299 deletions(-)
 create mode 100644 .github/stale.yml
 rename README.eng.md => README.en.md (97%)
 mode change 100644 => 120000 README.md
 create mode 100644 README.zh.md

diff --git a/.github/stale.yml b/.github/stale.yml
new file mode 100644
index 0000000..652133a
--- /dev/null
+++ b/.github/stale.yml
@@ -0,0 +1,18 @@
+# Number of days of inactivity before an issue becomes stale
+daysUntilStale: 60
+# Number of days of inactivity before a stale issue is closed
+daysUntilClose: 7
+# Issues with these labels will never be considered stale
+exemptLabels:
+  - pinned
+  - security
+# Label to use when marking an issue as stale
+staleLabel: wontfix
+# Comment to post when marking an issue as stale. Set to `false` to disable
+markComment: >
+  This issue has been automatically marked as stale because it has not had
+  recent activity. It will be closed if no further activity occurs. Feel free to reopen it.
+  Thank you for your contributions.
+# Comment to post when closing a stale issue. Set to `false` to disable
+closeComment: false
+
diff --git a/README.eng.md b/README.en.md
similarity index 97%
rename from README.eng.md
rename to README.en.md
index 5d24c36..0a840d1 100644
--- a/README.eng.md
+++ b/README.en.md
@@ -1,4 +1,4 @@
-English|[简体中文](./README.md)
+English|[简体中文](./README.zh.md)
 
 ![./.metas/ERNIE_milestone.png](./.metas/ERNIE_milestone.png)
 
@@ -76,10 +76,14 @@ Don't have GPU? try ERNIE in [AIStudio](https://aistudio.baidu.com/aistudio/inde
 
 # Setup
 
-##### 1. install ernie
+##### 1. install PaddlePaddle
+
+This repo requires PaddlePaddle 1.7.0+, please see [here](https://www.paddlepaddle.org.cn/install/quick) for installation instructions.
+
+##### 2. install ernie
 
 ```script
-pip install paddle-ernie==0.0.1.dev1
+pip install paddle-ernie
 ```
 
 or 
@@ -87,7 +91,7 @@ or
 ```shell
 git clone -b dygraph https://github.com/PaddlePaddle/ERNIE.git --single-branch
 cd ERNIE
-pip install -r requirement.txt
+pip install -r requirements.txt
 pip setup.py -e .
 
 ```
diff --git a/README.md b/README.md
deleted file mode 100644
index 62f8749..0000000
--- a/README.md
+++ /dev/null
@@ -1,278 +0,0 @@
-[English](./README.eng.md)|简体中文
-
-![./.metas/ERNIE_milestone.png](./.metas/ERNIE_milestone.png)
-
-ERNIE是百度开创性提出的基于知识增强的持续学习语义理解框架，该框架将大数据预训练与多源丰富知识相结合，通过持续学习技术，不断吸收海量文本数据中词汇、结构、语义等方面的知识，实现模型效果不断进化。ERNIE在情感分析、文本匹配、自然语言推理、词法分析、阅读理解、智能问答等16个公开数据集上全面显著超越世界领先技术，在国际权威的通用语言理解评估基准GLUE上，得分首次突破90分，获得全球第一。在今年3月落下帷幕的全球最大语义评测SemEval 2020上，ERNIE摘得5项世界冠军， 该技术也被全球顶级科技商业杂志《麻省理工科技评论》官方网站报道，相关创新成果也被国际顶级学术会议AAAI、IJCAI收录。ERNIE在工业界得到了大规模应用，如搜索引擎、新闻推荐、广告系统、语音交互、智能客服等。
-
-**提醒: ERNIE老版本代码已经迁移至[repro分支](https://github.com/PaddlePaddle/ERNIE/tree/repro)，欢迎使用我们全新升级的基于动静结合的新版ERNIE套件进行开发。另外，也欢迎上[EasyDL](https://ai.baidu.com/easydl/pro)体验更丰富的功能（如ERNIE 2.0、ERNIE 2.1、ERNIE领域模型等）。**
-
-
-# 新闻
-
-- 2020.5.20:     
-    - 欢迎试用`动态图`实现的 ERNIE:
-        - 基于[PaddlePaddle v1.8](https://github.com/PaddlePaddle/Paddle/tree/release/1.8)使用 ERNIE 进行 Pretrain 和 Finetune.
-        - 动态执行, 所见即所得。
-        - 大规模分布式训练。
-        - 易于部署。
-        - 通过Aistudio 教程快速入门NLP。
-        - 向后兼容老版 checkpoint。
-    -  `ERNIE-GEN` 模型正式开源! ([点击进入](https://github.com/PaddlePaddle/ERNIE/tree/repro/ernie-gen))
-        - 最强文本生成预训练模型正式开源，相关工作已被 `IJCAI-2020` 收录。
-            - 首次把 ERNIE 预训练技术能力扩展至文本生成领域，在多个典型任务上取得最佳。
-            - 您现在即可下载论文报告的所有模型（包含 [`base/large/large-160G`](https://github.com/PaddlePaddle/ERNIE/tree/repro/ernie-gen/README.zh.md#预训练模型)）。
-        - 首次在预训练阶段加入span-by-span 生成任务，让模型每次能够生成一个语义完整的片段。
-        - 提出填充式生成机制和噪声感知机制来缓解曝光偏差问题。
-        - 精巧的 Mulit-Flow Attention 实现框架。
-- 2020.4.30 发布[ERNIESage](https://github.com/PaddlePaddle/PGL/tree/master/examples/erniesage)， 一种新型图神经网络模型，采用ERNIE做为aggreagtor. 由[PGL](https://github.com/PaddlePaddle/PGL)实现。
-- 2020.3.27 [在SemEval2020五项子任务上夺冠](https://www.jiqizhixin.com/articles/2020-03-27-8)。
-- 2019.12.26 [GLUE榜第一名](https://www.technologyreview.com/2019/12/26/131372/ai-baidu-ernie-google-bert-natural-language-glue/)。
-- 2019.11.6 发布[ERNIE Tiny](https://www.jiqizhixin.com/articles/2019-11-06-9)。
-- 2019.7.7 发布[ERNIE 2.0](https://www.jiqizhixin.com/articles/2019-07-31-10)。
-- 2019.3.16 发布[ERNIE 1.0](https://www.jiqizhixin.com/articles/2019-03-16-3)。
-
-
-# 导航
-
-* [教程](#教程)
-* [安装](#安装)
-* [支持的NLP任务](#支持的nlp任务)
-* [预训练(ERNIE 1.0)](#预训练-ernie-10)
-* [在线预测](#在线预测)
-* [蒸馏](#蒸馏)
-
-# 快速上手
-```python
-import numpy as np
-import paddle.fluid.dygraph as D
-from ernie.tokenizing_ernie import ErnieTokenizer
-from ernie.modeling_ernie import ErnieModel
-
-D.guard().__enter__() # activate paddle `dygrpah` mode
-
-model = ErnieModel.from_pretrained('ernie-1.0')    # Try to get pretrained model from server, make sure you have network connection
-tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
-
-ids, _ = tokenizer.encode('hello world')
-ids = D.to_variable(np.expand_dims(ids, 0))  # insert extra `batch` dimension
-pooled, encoded = model(ids)                 # eager execution
-print(pooled.numpy())                        # convert  results to numpy
-
-```
-
-# 教程
-
-手边没有GPU？欢迎在[AIStudio](https://aistudio.baidu.com/aistudio/index)中直接试用 ERNIE. 
-(请选择最新版本的教程并申请GPU运行环境)
-
-1. [从0开始学ERNIE](https://aistudio.baidu.com/studio/edu/group/quick/join/314947)
-1. [情感识别](https://aistudio.baidu.com/aistudio/projectdetail/427482)
-2. [完形填空](https://aistudio.baidu.com/aistudio/projectdetail/433491)
-3. [知识蒸馏](https://aistudio.baidu.com/aistudio/projectdetail/439460)
-4. [万事不决问ERNIE](https://aistudio.baidu.com/aistudio/projectdetail/456443)
-
-# 安装
-
-##### 1. 安装 ERNIE
-
-
-```script
-pip install paddle-ernie==0.0.1.dev1
-```
-
-或者
-
-```shell
-git clone -b dygraph https://github.com/PaddlePaddle/ERNIE.git --single-branch
-cd ERNIE
-pip install -r requirement.txt
-pip setup.py -e .
-
-```
-
-##### 3. 下载预训练模型（可选）
-
-
-| Model                                              | Description                                                  |
-| :------------------------------------------------- | :----------------------------------------------------------- |
-| [ERNIE 1.0 Base 中文](https://ernie-github.cdn.bcebos.com/model-ernie1.0.1.tar.gz)           | L12H768A12  |
-| [ERNIE Tiny](https://ernie-github.cdn.bcebos.com/model-ernie_tiny.1.tar.gz)                 | L3H1024A16      |
-| [ERNIE 2.0 Base 英文](https://ernie-github.cdn.bcebos.com/model-ernie2.0-en.1.tar.gz)        | base: L12H768A12  |
-| [ERNIE 2.0 Large 英文](https://ernie-github.cdn.bcebos.com/model-ernie2.0-large-en.1.tar.gz) | large: L24H1024A16|
-| [ERNIE Gen base 英文](https://ernie-github.cdn.bcebos.com/model-ernie-gen-base-en.1.tar.gz)  | L12H768A12  |
-| [ERNIE Gen Large 英文](https://ernie-github.cdn.bcebos.com/model-ernie-gen-large-en.1.tar.gz)| L24H1024A16 |
-
-##### 4. 下载数据集
-
-
-**英文数据集**
-
-运行[此](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)脚本，下载[GLUE datasets](https://gluebenchmark.com/tasks).
-
-请将数据目录整理成以下格式，方便在后续 demo 教程中使用（通过`--data_dir`参数将数据路径传入训练脚本）；
-
-```shell
-data/xnli
-├── dev
-│   └── 1
-├── test
-│   └── 1
-└── train
-    └── 1
-```
-
-[示例](https://ernie-github.cdn.bcebos.com/data-mnli-m.tar.gz)数据（MNLI任务测试、训练集合）。
-
-
-**中文数据**
-
-| 数据集|描述|
-|:--------|:----------|
-| [XNLI](https://ernie-github.cdn.bcebos.com/data-xnli.tar.gz)                 |XNLI 是由 Facebook 和纽约大学的研究者联合构建的自然语言推断数据集，包括 15 种语言的数据。我们用其中的中文数据来评估模型的语言理解能力。[链接](https://github.com/facebookresearch/XNLI)|
-| [ChnSentiCorp](https://ernie-github.cdn.bcebos.com/data-chnsenticorp.tar.gz) |ChnSentiCorp 是一个中文情感分析数据集，包含酒店、笔记本电脑和书籍的网购评论。|
-| [MSRA-NER](https://ernie-github.cdn.bcebos.com/data-msra_ner.tar.gz)         |MSRA-NER (SIGHAN2006) 数据集由微软亚研院发布，其目标是识别文本中具有特定意义的实体，包括人名、地名、机构名。|
-| [NLPCC2016-DBQA](https://ernie-github.cdn.bcebos.com/data-dbqa.tar.gz)       |NLPCC2016-DBQA 是由国际自然语言处理和中文计算会议 NLPCC 于 2016 年举办的评测任务，其目标是从候选中找到合适的文档作为问题的答案。[链接](http://tcci.ccf.org.cn/conference/2016/dldoc/evagline2.pdf)|
-|[CMRC2018](https://ernie-github.cdn.bcebos.com/data-cmrc2018.tar.gz)|CMRC2018 是中文信息学会举办的评测，评测的任务是抽取类阅读理解。[链接](https://github.com/ymcui/cmrc2018)
-
-
-# 支持的NLP任务
-
-- 使用 `动态图` 模型进行finetune:
-
-```script
-python3 ./demo/finetune_classifier_dygraph.py \
-    --from_pretrained ernie_1.0 \
-    --data_dir ./data/xnli
-```
-
-- 分布式 finetune
-
-`paddle.distributed.launch` 是一个进程管理器，我们采用它在每一张GPU上启动一个python进程，并配置相应的环境变量以进行分布式训练:
-
-当采用分布式训练时，我们采用`max_steps`做为终止条件而非`epoch`, 这样处理是为了避免进程间死锁。
-另外值得注意的是训练集需要在不同的进程间进行切分；以避免所有进程训练同一份数据造成的过拟合。
-
-示例脚本（请确保你有两张以上GPU卡）:
-
-```script
-python3 -m paddle.distributed.launch \
-./demo/finetune_classifier_dygraph_distributed.py \
-    --data_dir data/mnli \
-    --max_steps 10000 \
-    --from_pretrained ernie2.0-en
-```
-
-
-更多示例脚本:
-
-1. [情感分析](./demo/finetune_sentiment_analysis_dygraph.py)
-1. [语义匹配](./demo/finetune_classifier_dygraph.py)
-1. [命名实体识别(NER)](./demo/finetune_ner_dygraph.py)
-1. [机器阅读理解](./demo/finetune_mrc_dygraph.py)
-1. [文本摘要生成](./experimental/seq2seq/README.md)
-
-
-**推荐超参数设置：**
-
-|任务|batch size|learning rate|
-|--|--|--|
-| CoLA         | 32 / 64 (base)  | 3e-5                     |
-| SST-2       
- | 64 / 256 (base) | 2e-5                     |
-| STS-B        | 128             | 5e-5                     |
-| QQP          | 256             | 3e-5(base)/5e-5(large)   |
-| MNLI         | 256 / 512 (base)| 3e-5                     |
-| QNLI         | 256             | 2e-5                     |
-| RTE          | 16 / 4 (base)   | 2e-5(base)/3e-5(large)   |
-| MRPC         | 16 / 32 (base)  | 3e-5                     |
-| WNLI         | 8               | 2e-5                     |
-| XNLI         | 512             | 1e-4(base)/4e-5(large)   |
-| CMRC2018     | 64              | 3e-5                     |
-| DRCD         | 64              | 5e-5(base)/3e-5(large)   |
-| MSRA-NER(SIGHAN2006)  | 16     | 5e-5(base)/1e-5(large)   |
-| ChnSentiCorp | 24              | 5e-5(base)/1e-5(large)   |
-| LCQMC        | 32              | 2e-5(base)/5e-6(large)   |
-| NLPCC2016-DBQA| 64             | 2e-5(base)/1e-5(large)   |
-
-# 预训练 (ERNIE 1.0)
-
-请见[这里](./demo/pretrain/README.md)
-
-# 在线预测
-
-如果`finetune_classifier_dygraph.py`中指定了`--inference_model_dir`参数，funetune脚本会将你的模型序列化并产出可以直接部署线上预测的`inference_model`.
-
-关于生产环境中使用线上预测代码的实现细节，请见[C++ inference API](./inference/README.md).
-或者你可以使用`propeller`启动一个多GPU预测服务(需要GPU环境)，只需执行：
-
-```shell
-python -m propeller.tools.start_server -m /path/to/saved/inference_model  -p 8881
-```
-
-即可启动预测服务；随后在Python端采用如下命令访问该服务(仅限 python3):
-
-```python
-from propeller.service.client import InferenceClient
-from ernie.tokenizing_ernie import ErnieTokenizer
-
-client = InferenceClient('tcp://localhost:8881')
-tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
-ids, sids = tokenizer.encode('hello world')
-ids = np.expand_dims(ids, 0)
-sids = np.expand_dims(sids, 0)
-result = client(ids, sids)
-```
-
-你也可从[此处]((https://ernie.bj.bcebos.com/ernie1.0_zh_inference_model.tar.gz).)下载一个预先制作好的ernie-1.0 base模型的 `inference_model`.
-该模型没有经过finetune，一般可以用做上层模型结构的 feature-base finetune或者做为一个文本特征抽取器。
-因为该模行由老版API 产出，在进行客户端请求时需要在输入tensor后面追加一个维度：
-
-```python3
-ids = np.expand_dims(ids, -1) # ids.shape==[BATCH, SEQLEN, 1]
-```
-
-# 蒸馏
-
-知识蒸馏是进行ERNIE模型压缩、加速的有效方式；关于知识蒸馏的实现细节请参见[这里](./distill/README.md)。
-
-# 文献引用
-
-### ERNIE 1.0
-```
-@article{sun2019ernie,
-  title={Ernie: Enhanced representation through knowledge integration},
-  author={Sun, Yu and Wang, Shuohuan and Li, Yukun and Feng, Shikun and Chen, Xuyi and Zhang, Han and Tian, Xin and Zhu, Danxiang and Tian, Hao and Wu, Hua},
-  journal={arXiv preprint arXiv:1904.09223},
-  year={2019}
-}
-```
-
-### ERNIE 2.0
-```
-@article{sun2019ernie20,
-  title={ERNIE 2.0: A Continual Pre-training Framework for Language Understanding},
-  author={Sun, Yu and Wang, Shuohuan and Li, Yukun and Feng, Shikun and Tian, Hao and Wu, Hua and Wang, Haifeng},
-  journal={arXiv preprint arXiv:1907.12412},
-  year={2019} 
-}
-```
-
-### ERNIE-GEN
-
-```
-@article{xiao2020ernie-gen,
-  title={ERNIE-GEN: An Enhanced Multi-Flow Pre-training and Fine-tuning Framework for Natural Language Generation},
-  author={Xiao, Dongling and Zhang, Han and Li, Yukun and Sun, Yu and Tian, Hao and Wu, Hua and Wang, Haifeng},
-  journal={arXiv preprint arXiv:2001.11314},
-  year={2020}
-}
-```
-
-若希望复现 paper 中的所有实验，请切换至本repo的`repro`分支。
-
-### 讨论组
-- [ERNIE官方主页](https://www.paddlepaddle.org.cn/ernie)
-- [Github Issues](https://github.com/PaddlePaddle/ERNIE/issues): bug reports, feature requests, install issues, usage issues, etc.
-- QQ 群: 760439550 (ERNIE discussion group).
-- [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
-
diff --git a/README.md b/README.md
new file mode 120000
index 0000000..082ec95
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+README.zh.md
\ No newline at end of file
diff --git a/README.zh.md b/README.zh.md
new file mode 100644
index 0000000..55d8f20
--- /dev/null
+++ b/README.zh.md
@@ -0,0 +1,282 @@
+[English](./README.en.md)|简体中文
+
+![./.metas/ERNIE_milestone.png](./.metas/ERNIE_milestone.png)
+
+ERNIE是百度开创性提出的基于知识增强的持续学习语义理解框架，该框架将大数据预训练与多源丰富知识相结合，通过持续学习技术，不断吸收海量文本数据中词汇、结构、语义等方面的知识，实现模型效果不断进化。ERNIE在情感分析、文本匹配、自然语言推理、词法分析、阅读理解、智能问答等16个公开数据集上全面显著超越世界领先技术，在国际权威的通用语言理解评估基准GLUE上，得分首次突破90分，获得全球第一。在今年3月落下帷幕的全球最大语义评测SemEval 2020上，ERNIE摘得5项世界冠军， 该技术也被全球顶级科技商业杂志《麻省理工科技评论》官方网站报道，相关创新成果也被国际顶级学术会议AAAI、IJCAI收录。ERNIE在工业界得到了大规模应用，如搜索引擎、新闻推荐、广告系统、语音交互、智能客服等。
+
+**提醒: ERNIE老版本代码已经迁移至[repro分支](https://github.com/PaddlePaddle/ERNIE/tree/repro)，欢迎使用我们全新升级的基于动静结合的新版ERNIE套件进行开发。另外，也欢迎上[EasyDL](https://ai.baidu.com/easydl/pro)体验更丰富的功能（如ERNIE 2.0、ERNIE 2.1、ERNIE领域模型等）。**
+
+
+# 新闻
+
+- 2020.5.20:     
+    - 欢迎试用`动态图`实现的 ERNIE:
+        - 基于[PaddlePaddle v1.8](https://github.com/PaddlePaddle/Paddle/tree/release/1.8)使用 ERNIE 进行 Pretrain 和 Finetune.
+        - 动态执行, 所见即所得。
+        - 大规模分布式训练。
+        - 易于部署。
+        - 通过Aistudio 教程快速入门NLP。
+        - 向后兼容老版 checkpoint。
+    -  `ERNIE-GEN` 模型正式开源! ([点击进入](https://github.com/PaddlePaddle/ERNIE/tree/repro/ernie-gen))
+        - 最强文本生成预训练模型正式开源，相关工作已被 `IJCAI-2020` 收录。
+            - 首次把 ERNIE 预训练技术能力扩展至文本生成领域，在多个典型任务上取得最佳。
+            - 您现在即可下载论文报告的所有模型（包含 [`base/large/large-160G`](https://github.com/PaddlePaddle/ERNIE/tree/repro/ernie-gen/README.zh.md#预训练模型)）。
+        - 首次在预训练阶段加入span-by-span 生成任务，让模型每次能够生成一个语义完整的片段。
+        - 提出填充式生成机制和噪声感知机制来缓解曝光偏差问题。
+        - 精巧的 Multi-Flow Attention 实现框架。
+- 2020.4.30 发布[ERNIESage](https://github.com/PaddlePaddle/PGL/tree/master/examples/erniesage)， 一种新型图神经网络模型，采用ERNIE做为aggregator. 由[PGL](https://github.com/PaddlePaddle/PGL)实现。
+- 2020.3.27 [在SemEval2020五项子任务上夺冠](https://www.jiqizhixin.com/articles/2020-03-27-8)。
+- 2019.12.26 [GLUE榜第一名](https://www.technologyreview.com/2019/12/26/131372/ai-baidu-ernie-google-bert-natural-language-glue/)。
+- 2019.11.6 发布[ERNIE Tiny](https://www.jiqizhixin.com/articles/2019-11-06-9)。
+- 2019.7.7 发布[ERNIE 2.0](https://www.jiqizhixin.com/articles/2019-07-31-10)。
+- 2019.3.16 发布[ERNIE 1.0](https://www.jiqizhixin.com/articles/2019-03-16-3)。
+
+
+# 导航
+
+* [教程](#教程)
+* [安装](#安装)
+* [支持的NLP任务](#支持的nlp任务)
+* [预训练(ERNIE 1.0)](#预训练-ernie-10)
+* [在线预测](#在线预测)
+* [蒸馏](#蒸馏)
+
+# 快速上手
+```python
+import numpy as np
+import paddle.fluid.dygraph as D
+from ernie.tokenizing_ernie import ErnieTokenizer
+from ernie.modeling_ernie import ErnieModel
+
+D.guard().__enter__() # activate paddle `dygraph` mode
+
+model = ErnieModel.from_pretrained('ernie-1.0')    # Try to get pretrained model from server, make sure you have network connection
+tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
+
+ids, _ = tokenizer.encode('hello world')
+ids = D.to_variable(np.expand_dims(ids, 0))  # insert extra `batch` dimension
+pooled, encoded = model(ids)                 # eager execution
+print(pooled.numpy())                        # convert  results to numpy
+
+```
+
+# 教程
+
+手边没有GPU？欢迎在[AIStudio](https://aistudio.baidu.com/aistudio/index)中直接试用 ERNIE. 
+(请选择最新版本的教程并申请GPU运行环境)
+
+1. [从0开始学ERNIE](https://aistudio.baidu.com/studio/edu/group/quick/join/314947)
+1. [情感识别](https://aistudio.baidu.com/aistudio/projectdetail/427482)
+2. [完形填空](https://aistudio.baidu.com/aistudio/projectdetail/433491)
+3. [知识蒸馏](https://aistudio.baidu.com/aistudio/projectdetail/439460)
+4. [万事不决问ERNIE](https://aistudio.baidu.com/aistudio/projectdetail/456443)
+
+# 安装
+
+##### 1. 安装 PaddlePaddle
+
+本项目依赖PaddlePaddle 1.7.0+， 请参考[这里](https://www.paddlepaddle.org.cn/install/quick)安装 PaddlePaddle。
+
+##### 2. 安装 ERNIE
+
+
+```script
+pip install paddle-ernie
+```
+
+或者
+
+```shell
+git clone -b dygraph https://github.com/PaddlePaddle/ERNIE.git --single-branch
+cd ERNIE
+pip install -r requirements.txt
+pip install -e .
+
+```
+
+##### 3. 下载预训练模型（可选）
+
+
+| Model                                              | Description                                                  |
+| :------------------------------------------------- | :----------------------------------------------------------- |
+| [ERNIE 1.0 Base 中文](https://ernie-github.cdn.bcebos.com/model-ernie1.0.1.tar.gz)           | L12H768A12  |
+| [ERNIE Tiny](https://ernie-github.cdn.bcebos.com/model-ernie_tiny.1.tar.gz)                 | L3H1024A16      |
+| [ERNIE 2.0 Base 英文](https://ernie-github.cdn.bcebos.com/model-ernie2.0-en.1.tar.gz)        | base: L12H768A12  |
+| [ERNIE 2.0 Large 英文](https://ernie-github.cdn.bcebos.com/model-ernie2.0-large-en.1.tar.gz) | large: L24H1024A16|
+| [ERNIE Gen base 英文](https://ernie-github.cdn.bcebos.com/model-ernie-gen-base-en.1.tar.gz)  | L12H768A12  |
+| [ERNIE Gen Large 英文](https://ernie-github.cdn.bcebos.com/model-ernie-gen-large-en.1.tar.gz)| L24H1024A16 |
+
+##### 4. 下载数据集
+
+
+**英文数据集**
+
+运行[此](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)脚本，下载[GLUE datasets](https://gluebenchmark.com/tasks).
+
+请将数据目录整理成以下格式，方便在后续 demo 教程中使用（通过`--data_dir`参数将数据路径传入训练脚本）；
+
+```shell
+data/xnli
+├── dev
+│   └── 1
+├── test
+│   └── 1
+└── train
+    └── 1
+```
+
+[示例](https://ernie-github.cdn.bcebos.com/data-mnli-m.tar.gz)数据（MNLI任务测试、训练集合）。
+
+
+**中文数据**
+
+| 数据集|描述|
+|:--------|:----------|
+| [XNLI](https://ernie-github.cdn.bcebos.com/data-xnli.tar.gz)                 |XNLI 是由 Facebook 和纽约大学的研究者联合构建的自然语言推断数据集，包括 15 种语言的数据。我们用其中的中文数据来评估模型的语言理解能力。[链接](https://github.com/facebookresearch/XNLI)|
+| [ChnSentiCorp](https://ernie-github.cdn.bcebos.com/data-chnsenticorp.tar.gz) |ChnSentiCorp 是一个中文情感分析数据集，包含酒店、笔记本电脑和书籍的网购评论。|
+| [MSRA-NER](https://ernie-github.cdn.bcebos.com/data-msra_ner.tar.gz)         |MSRA-NER (SIGHAN2006) 数据集由微软亚研院发布，其目标是识别文本中具有特定意义的实体，包括人名、地名、机构名。|
+| [NLPCC2016-DBQA](https://ernie-github.cdn.bcebos.com/data-dbqa.tar.gz)       |NLPCC2016-DBQA 是由国际自然语言处理和中文计算会议 NLPCC 于 2016 年举办的评测任务，其目标是从候选中找到合适的文档作为问题的答案。[链接](http://tcci.ccf.org.cn/conference/2016/dldoc/evagline2.pdf)|
+|[CMRC2018](https://ernie-github.cdn.bcebos.com/data-cmrc2018.tar.gz)|CMRC2018 是中文信息学会举办的评测，评测的任务是抽取类阅读理解。[链接](https://github.com/ymcui/cmrc2018)
+
+
+# 支持的NLP任务
+
+- 使用 `动态图` 模型进行finetune:
+
+```script
+python3 ./demo/finetune_classifier_dygraph.py \
+    --from_pretrained ernie-1.0 \
+    --data_dir ./data/xnli
+```
+
+- 分布式 finetune
+
+`paddle.distributed.launch` 是一个进程管理器，我们采用它在每一张GPU上启动一个python进程，并配置相应的环境变量以进行分布式训练:
+
+当采用分布式训练时，我们采用`max_steps`做为终止条件而非`epoch`, 这样处理是为了避免进程间死锁。
+另外值得注意的是训练集需要在不同的进程间进行切分；以避免所有进程训练同一份数据造成的过拟合。
+
+示例脚本（请确保你有两张以上GPU卡）:
+
+```script
+python3 -m paddle.distributed.launch \
+./demo/finetune_classifier_dygraph_distributed.py \
+    --data_dir data/mnli \
+    --max_steps 10000 \
+    --from_pretrained ernie2.0-en
+```
+
+
+更多示例脚本:
+
+1. [情感分析](./demo/finetune_sentiment_analysis_dygraph.py)
+1. [语义匹配](./demo/finetune_classifier_dygraph.py)
+1. [命名实体识别(NER)](./demo/finetune_ner_dygraph.py)
+1. [机器阅读理解](./demo/finetune_mrc_dygraph.py)
+1. [文本摘要生成](./experimental/seq2seq/README.md)
+
+
+**推荐超参数设置：**
+
+|任务|batch size|learning rate|
+|--|--|--|
+| CoLA         | 32 / 64 (base)  | 3e-5                     |
+| SST-2       
+ | 64 / 256 (base) | 2e-5                     |
+| STS-B        | 128             | 5e-5                     |
+| QQP          | 256             | 3e-5(base)/5e-5(large)   |
+| MNLI         | 256 / 512 (base)| 3e-5                     |
+| QNLI         | 256             | 2e-5                     |
+| RTE          | 16 / 4 (base)   | 2e-5(base)/3e-5(large)   |
+| MRPC         | 16 / 32 (base)  | 3e-5                     |
+| WNLI         | 8               | 2e-5                     |
+| XNLI         | 512             | 1e-4(base)/4e-5(large)   |
+| CMRC2018     | 64              | 3e-5                     |
+| DRCD         | 64              | 5e-5(base)/3e-5(large)   |
+| MSRA-NER(SIGHAN2006)  | 16     | 5e-5(base)/1e-5(large)   |
+| ChnSentiCorp | 24              | 5e-5(base)/1e-5(large)   |
+| LCQMC        | 32              | 2e-5(base)/5e-6(large)   |
+| NLPCC2016-DBQA| 64             | 2e-5(base)/1e-5(large)   |
+
+# 预训练 (ERNIE 1.0)
+
+请见[这里](./demo/pretrain/README.md)
+
+# 在线预测
+
+如果`finetune_classifier_dygraph.py`中指定了`--inference_model_dir`参数，finetune脚本会将你的模型序列化并产出可以直接部署线上预测的`inference_model`.
+
+关于生产环境中使用线上预测代码的实现细节，请见[C++ inference API](./inference/README.md).
+或者你可以使用`propeller`启动一个多GPU预测服务(需要GPU环境)，只需执行：
+
+```shell
+python -m propeller.tools.start_server -m /path/to/saved/inference_model  -p 8881
+```
+
+即可启动预测服务；随后在Python端采用如下命令访问该服务(仅限 python3):
+
+```python
+from propeller.service.client import InferenceClient
+from ernie.tokenizing_ernie import ErnieTokenizer
+
+client = InferenceClient('tcp://localhost:8881')
+tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
+ids, sids = tokenizer.encode('hello world')
+ids = np.expand_dims(ids, 0)
+sids = np.expand_dims(sids, 0)
+result = client(ids, sids)
+```
+
+你也可从[此处](https://ernie.bj.bcebos.com/ernie1.0_zh_inference_model.tar.gz)下载一个预先制作好的ernie-1.0 base模型的 `inference_model`.
+该模型没有经过finetune，一般可以用做上层模型结构的 feature-base finetune或者做为一个文本特征抽取器。
+因为该模型由老版API 产出，在进行客户端请求时需要在输入tensor后面追加一个维度：
+
+```python3
+ids = np.expand_dims(ids, -1) # ids.shape==[BATCH, SEQLEN, 1]
+```
+
+# 蒸馏
+
+知识蒸馏是进行ERNIE模型压缩、加速的有效方式；关于知识蒸馏的实现细节请参见[这里](./distill/README.md)。
+
+# 文献引用
+
+### ERNIE 1.0
+```
+@article{sun2019ernie,
+  title={Ernie: Enhanced representation through knowledge integration},
+  author={Sun, Yu and Wang, Shuohuan and Li, Yukun and Feng, Shikun and Chen, Xuyi and Zhang, Han and Tian, Xin and Zhu, Danxiang and Tian, Hao and Wu, Hua},
+  journal={arXiv preprint arXiv:1904.09223},
+  year={2019}
+}
+```
+
+### ERNIE 2.0
+```
+@article{sun2019ernie20,
+  title={ERNIE 2.0: A Continual Pre-training Framework for Language Understanding},
+  author={Sun, Yu and Wang, Shuohuan and Li, Yukun and Feng, Shikun and Tian, Hao and Wu, Hua and Wang, Haifeng},
+  journal={arXiv preprint arXiv:1907.12412},
+  year={2019} 
+}
+```
+
+### ERNIE-GEN
+
+```
+@article{xiao2020ernie-gen,
+  title={ERNIE-GEN: An Enhanced Multi-Flow Pre-training and Fine-tuning Framework for Natural Language Generation},
+  author={Xiao, Dongling and Zhang, Han and Li, Yukun and Sun, Yu and Tian, Hao and Wu, Hua and Wang, Haifeng},
+  journal={arXiv preprint arXiv:2001.11314},
+  year={2020}
+}
+```
+
+若希望复现 paper 中的所有实验，请切换至本repo的`repro`分支。
+
+### 讨论组
+- [ERNIE官方主页](https://www.paddlepaddle.org.cn/ernie)
+- [Github Issues](https://github.com/PaddlePaddle/ERNIE/issues): bug reports, feature requests, install issues, usage issues, etc.
+- QQ 群: 760439550 (ERNIE discussion group).
+- [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
+
diff --git a/ernie/__init__.py b/ernie/__init__.py
index 35fb96e..db8bc37 100644
--- a/ernie/__init__.py
+++ b/ernie/__init__.py
@@ -17,6 +17,11 @@ from __future__ import absolute_import
 from __future__ import print_function
 from __future__ import unicode_literals
 
+import paddle
+# Parse only (major, minor): later fields may carry suffixes such as `-rc0`; compare both so 2.x passes.
+paddle_version = [int(i) for i in paddle.__version__.split('.')[:2]]
+if paddle_version < [1, 7]:
+    raise RuntimeError('paddle-ernie requires paddle 1.7+, got %s' % paddle.__version__)
 
 from ernie.modeling_ernie import ErnieModel
 from ernie.modeling_ernie import (ErnieModelForSequenceClassification, 
diff --git a/experimental/seq2seq/decode.py b/experimental/seq2seq/decode.py
index 6279b6d..a048548 100644
--- a/experimental/seq2seq/decode.py
+++ b/experimental/seq2seq/decode.py
@@ -194,7 +194,7 @@ def beam_search_step(state, logits, eos_id, beam_width, is_first_step):
 
 
 @D.no_grad
-def beam_search_infilling(model, q_ids, q_sids, sos_id, eos_id, attn_id, max_encode_len=640, max_decode_len=100, beam_width=5,):
+def beam_search_infilling(model, q_ids, q_sids, sos_id, eos_id, attn_id, max_encode_len=640, max_decode_len=100, beam_width=5, tgt_type_id=3):
     model.eval()
     #log.debug(q_ids.numpy().tolist())
     _, __, info = model(q_ids, q_sids)
@@ -228,7 +228,7 @@ def beam_search_infilling(model, q_ids, q_sids, sos_id, eos_id, attn_id, max_enc
         bias = gen_bias(q_ids, ids, step)
         pos_ids = D.to_variable(np.tile(np.array([[step, step + 1]], dtype=np.int64), [d_batch * beam_width, 1]))
         pos_ids += seqlen
-        _, logits, info = model(ids, L.ones_like(ids) * 3, pos_ids=pos_ids, attn_bias=bias, past_cache=past_cache)
+        _, logits, info = model(ids, L.ones_like(ids) * tgt_type_id, pos_ids=pos_ids, attn_bias=bias, past_cache=past_cache)
 
         past_cached_k, past_cached_v = past_cache
         cached_k, cached_v = info['caches']
@@ -307,7 +307,8 @@ if __name__ == '__main__':
                 attn_id=tokenizer.vocab['[ATTN]'],
                 max_decode_len=args.max_decode_len, 
                 max_encode_len=args.max_encode_len, 
-                beam_width=args.beam_width)
+                beam_width=args.beam_width,
+                tgt_type_id=args.tgt_type_id)
 
         output_str = rev_lookup(result_ids.numpy())
         for ostr in output_str.tolist():
diff --git a/experimental/seq2seq/finetune_seq2seq_dygraph.py b/experimental/seq2seq/finetune_seq2seq_dygraph.py
index 2e088a0..b2fa470 100644
--- a/experimental/seq2seq/finetune_seq2seq_dygraph.py
+++ b/experimental/seq2seq/finetune_seq2seq_dygraph.py
@@ -64,10 +64,11 @@ def evaluate(model, datasets, step, args):
             output_ids = beam_search_infilling(model, src_ids, src_sids,
                     eos_id=tokenizer.sep_id,
                     sos_id=tokenizer.cls_id,
-                    attn_id=tokenizer.vocab['[ATTN]'],
+                    attn_id=tokenizer.vocab[args.attn_token],
                     max_decode_len=args.max_decode_len, 
                     max_encode_len=args.max_encode_len, 
-                    beam_width=args.beam_width)
+                    beam_width=args.beam_width,
+                    tgt_type_id=args.tgt_type_id,)
             output_str = rev_lookup(output_ids.numpy())
             for eid, ostr in zip(example_id.numpy().tolist(), output_str.tolist()):
                 if '[SEP]' in ostr:
@@ -80,7 +81,7 @@ def evaluate(model, datasets, step, args):
 
 def seq2seq(model, tokenizer, args):
     log.info('Training starts with args: %r' % args)
-    attn_id = tokenizer.vocab['[ATTN]']
+    attn_id = tokenizer.vocab[args.attn_token]
     def gen_mask(batch_ids, mask_type='bidi', query_len=None, pad_value=0):
         if query_len is None:
             query_len = batch_ids.shape[1]
@@ -224,11 +225,12 @@ def seq2seq(model, tokenizer, args):
     dev_ds.data_shapes = shapes
     dev_ds.data_types = types
 
+    vocab_size, _ = model.word_emb.weight.shape
     ctx = D.parallel.prepare_context()
     model = D.parallel.DataParallel(model, ctx)
     opt = AdamW(learning_rate=LinearDecay(args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps), parameter_list=model.parameters(), weight_decay=args.wd)
     g_clip = F.dygraph_grad_clip.GradClipByGlobalNorm(1.0)
-    attn_id = tokenizer.vocab['[ATTN]']
+    attn_id = tokenizer.vocab[args.attn_token]
     for step, data in enumerate(train_ds.start(place)):
         (example_id, src_ids, src_sids, src_pids,
          tgt_ids, tgt_sids, tgt_pids,
@@ -242,7 +244,7 @@ def seq2seq(model, tokenizer, args):
         past_cache_k = [L.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)]
         past_cache_v = [L.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)]
         if args.label_smooth > 0.:
-            tgt_labels = L.label_smooth(F.one_hot(tgt_labels, len(tokenizer.vocab)), epsilon=args.label_smooth)
+            tgt_labels = L.label_smooth(F.one_hot(tgt_labels, vocab_size), epsilon=args.label_smooth)
         loss, _, __ = model(attn_ids, sent_ids=tgt_sids, pos_ids=tgt_pids, attn_bias=mask_attn_2_srctgtattn, 
                 past_cache=(past_cache_k, past_cache_v), 
                 tgt_labels=tgt_labels, 
@@ -283,12 +285,13 @@ if __name__ == '__main__':
     parser.add_argument('--max_decode_len', type=int, default=120)
     parser.add_argument('--tgt_type_id', type=int, default=3)
     parser.add_argument('--warmup_proportion', type=float, default=0.1)
-    parser.add_argument('--beam_width', type=int, default=3)
+    parser.add_argument('--beam_width', type=int, default=5)
     parser.add_argument('--noise_prob', type=float, default=0.7, help='probability of token be repalced')
     parser.add_argument('--use_random_noice', action='store_true', help='if set, replace target tokens with random token from vocabulary, else replace with `[NOISE]`')
     parser.add_argument('--lr', type=float, default=5e-5, help='learning rate')
     parser.add_argument('--label_smooth', type=float, default=0.1)
     parser.add_argument('--predict_output_dir', type=str, default=None, help='predict file output directory')
+    parser.add_argument('--attn_token', type=str, default='[ATTN]', help='if [ATTN] not in vocab, you can specify [MASK] as attn-token')
     parser.add_argument('--inference_model_dir', type=str, default=None, help='inference model output directory')
     parser.add_argument('--save_dir', type=str, default=None, help='model output directory')
     parser.add_argument('--wd', type=float, default=0.01, help='weight decay, aka L2 regularizer')
diff --git a/experimental/seq2seq/modeling_ernie_gen.py b/experimental/seq2seq/modeling_ernie_gen.py
index a077335..cee4f1c 100644
--- a/experimental/seq2seq/modeling_ernie_gen.py
+++ b/experimental/seq2seq/modeling_ernie_gen.py
@@ -32,6 +32,7 @@ class ErnieModelForGeneration(ErnieModel):
     resource_map = {
         'ernie-gen-base-en': ErnieModel.bce + 'model-ernie-gen-base-en.1.tar.gz',
         'ernie-gen-large-en': ErnieModel.bce + 'model-ernie-gen-large-en.1.tar.gz',
+        'ernie-1.0': ErnieModel.bce + 'model-ernie1.0.1.tar.gz',
     }
     def __init__(self, cfg, name=None):
         cfg['return_additional_info'] = True
diff --git a/requirements.txt b/requirements.txt
index c7edd1c..54a1009 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,5 @@
-nltk==3.4
-numpy==1.14.5
+numpy
 pyzmq==18.0.2
-scikit-learn==0.20.3
-scipy==1.2.1
 six==1.11.0
 sklearn==0.0
 sentencepiece==0.1.8
-paddlepaddle-gpu==1.7.1.post107
diff --git a/setup.py b/setup.py
index b34b372..ac97709 100644
--- a/setup.py
+++ b/setup.py
@@ -22,16 +22,15 @@ with open("README.md", "r", encoding='utf-8') as fh:
 
 setuptools.setup(
     name="paddle-ernie", # Replace with your own username
-    version="0.0.1dev1",
+    version="0.0.2dev1",
     author="Baidu Ernie Team",
     author_email="ernieernie.team@gmail.com",
     description="A pretrained NLP model for every NLP tasks",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    url="https://github.com/PaddlePaddle/ERNIE/tree/dygraph",
+    url="https://github.com/PaddlePaddle/ERNIE/",
     packages=['ernie'],
     install_requires=[
-        'paddlepaddle-gpu>=1.7.1',
         'requests',
         'tqdm',
         ],
-- 
GitLab