From de4063b54f9b076b6435bc901e8ef0f131c5b585 Mon Sep 17 00:00:00 2001 From: Meiyim Date: Thu, 31 Dec 2020 14:30:53 +0800 Subject: [PATCH] Paddle 2.0 (#604) * update to paddle 2. * update readme * upgrade multi card fintune example * use paddle.AdamW, use grad acc * bump propeller * remove grad acc * fix ner * update propeller & distribued sample * wip * +seq2seq * format * fix erneigen * fix pretrain * fix static * update propeller for py37 compat * fix pretrain static * up readme * update readme * static pretrain * remove optimization out of core libray * ner use `cross_entropy`, use `ignore index` * fix dygraph pretrain: add stop criteria * bugfix, LN wrong initialize * add grad acc for classifiction task * seq2seq use fp32 when decoding * use `paddle.io.DataLoader` * + distill * update readme * update distill fig link * propeller use vdl * do not use pure fp16 for static graph Co-authored-by: chenxuyi --- .github/stale.yml | 1 - .pre-commit-config.yaml | 17 + README.en.md | 66 ++- README.zh.md | 49 +- demo/__init__.py | 0 {distill => demo/distill}/README.md | 11 +- demo/distill/distill.py | 298 ++++++++++ demo/finetune_classifier.py | 419 ++++++++------ demo/finetune_classifier_distributed.py | 205 +++++++ demo/finetune_classifier_dygraph.py | 157 ----- ...finetune_classifier_dygraph_distributed.py | 146 ----- demo/finetune_classifier_static.py | 251 ++++++++ demo/finetune_mrc.py | 247 ++++++++ demo/finetune_mrc_dygraph.py | 176 ------ demo/finetune_ner.py | 258 +++++++++ demo/finetune_ner_dygraph.py | 192 ------- demo/finetune_sentiment_analysis.py | 206 +++++++ demo/finetune_sentiment_analysis_dygraph.py | 157 ----- demo/mrc/mrc_metrics.py | 51 +- demo/mrc/mrc_reader.py | 65 ++- demo/optimization.py | 89 +++ demo/pretrain/README.md | 3 +- demo/pretrain/make_pretrain_data.py | 45 +- demo/pretrain/pretrain.py | 312 +++++----- ...pretrain_dygraph.py => pretrain_static.py} | 261 ++++++--- demo/seq2seq/README.md | 12 +- demo/seq2seq/decode.py | 470 +++++++++------ demo/seq2seq/finetune_seq2seq.py | 420 ++++++++++++++ demo/seq2seq/finetune_seq2seq_dygraph.py | 318 ---------- demo/utils.py | 44 ++ distill/distill.py | 239 -------- ernie-gen/README.md | 2 +- ernie-vil/README.md | 2 +- ernie/__init__.py | 13 +- ernie/file_utils.py | 18 +- ernie/modeling_ernie.py | 542 +++++++++++------- ernie/optimization.py | 203 ------- ernie/tokenizing_ernie.py | 103 ++-- experimental/seq2seq/README.md | 60 +- inference/README.md | 1 - inference/cpu/CMakeLists.txt | 1 - inference/gpu/CMakeLists.txt | 1 - patch | 35 ++ propeller/data/__init__.py | 2 + propeller/{paddle => }/data/example.proto | 2 +- propeller/{paddle => }/data/example_pb2.py | 30 +- propeller/{paddle => }/data/feature.proto | 0 propeller/data/feature_column.py | 516 +++++++++++++++++ propeller/{paddle => }/data/feature_pb2.py | 60 +- propeller/data/functional.py | 174 +++++- propeller/paddle/__init__.py | 14 +- propeller/paddle/data/__init__.py | 3 +- propeller/paddle/data/feature_column.py | 448 +-------------- propeller/paddle/data/functional.py | 5 +- propeller/paddle/train/distribution.py | 18 + propeller/paddle/train/hooks.py | 25 +- propeller/paddle/train/metrics.py | 159 ++++- propeller/paddle/train/monitored_executor.py | 99 ++-- propeller/paddle/train/trainer.py | 98 +++- propeller/tools/ckpt_inspector.py | 36 ++ propeller/types.py | 3 +- requirements.txt | 1 + setup.py | 10 +- 63 files changed, 4691 insertions(+), 3178 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 demo/__init__.py rename {distill => 
demo/distill}/README.md (97%) create mode 100644 demo/distill/distill.py create mode 100644 demo/finetune_classifier_distributed.py delete mode 100644 demo/finetune_classifier_dygraph.py delete mode 100644 demo/finetune_classifier_dygraph_distributed.py create mode 100644 demo/finetune_classifier_static.py create mode 100644 demo/finetune_mrc.py delete mode 100644 demo/finetune_mrc_dygraph.py create mode 100644 demo/finetune_ner.py delete mode 100644 demo/finetune_ner_dygraph.py create mode 100644 demo/finetune_sentiment_analysis.py delete mode 100644 demo/finetune_sentiment_analysis_dygraph.py create mode 100644 demo/optimization.py rename demo/pretrain/{pretrain_dygraph.py => pretrain_static.py} (50%) create mode 100644 demo/seq2seq/finetune_seq2seq.py delete mode 100644 demo/seq2seq/finetune_seq2seq_dygraph.py create mode 100644 demo/utils.py delete mode 100644 distill/distill.py delete mode 100644 ernie/optimization.py mode change 120000 => 100644 experimental/seq2seq/README.md create mode 100644 patch rename propeller/{paddle => }/data/example.proto (95%) rename propeller/{paddle => }/data/example_pb2.py (76%) rename propeller/{paddle => }/data/feature.proto (100%) create mode 100644 propeller/data/feature_column.py rename propeller/{paddle => }/data/feature_pb2.py (87%) diff --git a/.github/stale.yml b/.github/stale.yml index 652133a..9172b57 100644 --- a/.github/stale.yml +++ b/.github/stale.yml @@ -15,4 +15,3 @@ markComment: > Thank you for your contributions. # Comment to post when closing a stale issue. Set to `false` to disable closeComment: false - diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..9b28374 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +repos: +- repo: https://github.com/PaddlePaddle/mirrors-yapf.git + rev: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37 + hooks: + - id: yapf + files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0 + hooks: + - id: check-added-large-files + - id: check-merge-conflict + - id: check-symlinks + - id: detect-private-key + files: (?!.*third_party)^.*$ | (?!.*book)^.*$ + - id: end-of-file-fixer diff --git a/README.en.md b/README.en.md index 59312ee..2de6840 100644 --- a/README.en.md +++ b/README.en.md @@ -11,7 +11,13 @@ ERNIE 2.0 builds a strong basic for nearly every NLP tasks: Text Classification, [\[more information\]](https://wenxin.baidu.com/) # News -- Sept.24.2020: + +- Dec.29.2020: + - Pretrain and finetune ERNIE with [PaddlePaddle v2.0](https://github.com/PaddlePaddle/Paddle/tree/release/2.0-rc). + - New AMP(auto mixed precision) feature for every demo in this repo. + - Introducing `Gradient accumulation`, run `ERNIE-large` with only 8G memory. + +- Sept.24.2020: - [`ERNIE-ViL`](https://github.com/PaddlePaddle/ERNIE/tree/repro/ernie-vil) is **avaliable** now! - A **knowledge-enhanced** joint representations for vision-language tasks. - Constructing three **Scene Graph Prediction** tasks utilizing structured knowledge. @@ -20,20 +26,19 @@ ERNIE 2.0 builds a strong basic for nearly every NLP tasks: Text Classification, - May.20.2020: - Try ERNIE in "`dygraph`", with: - - Pretrain and finetune ERNIE with [PaddlePaddle v1.8](https://github.com/PaddlePaddle/Paddle/tree/release/1.8). - Eager execution with `paddle.fluid.dygraph`. - Distributed training. - Easy deployment. 
- Learn NLP in Aistudio tutorials. - Backward compatibility for old-styled checkpoint - + - [`ERNIE-GEN`](https://github.com/PaddlePaddle/ERNIE/tree/repro/ernie-gen) is **avaliable** now! ([link here](https://github.com/PaddlePaddle/ERNIE/tree/repro/ernie-gen)) - the **state-of-the-art** pre-trained model for generation tasks, accepted by `IJCAI-2020`. - A novel **span-by-span generation pre-training task**. - An **infilling generation** echanism and a **noise-aware generation** method. - Implemented by a carefully designed **Multi-Flow Attention** architecture. - You are able to `download` all models including `base/large/large-430G`. - + - Apr.30.2020: Release [ERNIESage](https://github.com/PaddlePaddle/PGL/tree/master/examples/erniesage), a novel Graph Neural Network Model using ERNIE as its aggregator. It is implemented through [PGL](https://github.com/PaddlePaddle/PGL) - Mar.27.2020: [Champion on 5 SemEval2020 sub tasks](https://www.jiqizhixin.com/articles/2020-03-27-8) - Dec.26.2019: [1st place on GLUE leaderboard](https://www.technologyreview.com/2019/12/26/131372/ai-baidu-ernie-google-bert-natural-language-glue/) @@ -41,7 +46,7 @@ ERNIE 2.0 builds a strong basic for nearly every NLP tasks: Text Classification, - Jul.7.2019: [Introducing ERNIE2.0](https://www.jiqizhixin.com/articles/2019-07-31-10) - Mar.16.2019: [Introducing ERNIE1.0](https://www.jiqizhixin.com/articles/2019-03-16-3) - + # Table of contents * [Tutorials](#tutorials) * [Setup](#setup) @@ -54,18 +59,16 @@ ERNIE 2.0 builds a strong basic for nearly every NLP tasks: Text Classification, ```python import numpy as np -import paddle.fluid.dygraph as D +import paddle as P from ernie.tokenizing_ernie import ErnieTokenizer from ernie.modeling_ernie import ErnieModel -D.guard().__enter__() # activate paddle `dygrpah` mode - model = ErnieModel.from_pretrained('ernie-1.0') # Try to get pretrained model from server, make sure you have network connection model.eval() tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') ids, _ = tokenizer.encode('hello world') -ids = D.to_variable(np.expand_dims(ids, 0)) # insert extra `batch` dimension +ids = P.to_tensor(np.expand_dims(ids, 0)) # insert extra `batch` dimension pooled, encoded = model(ids) # eager execution print(pooled.numpy()) # convert results to numpy @@ -95,7 +98,7 @@ This repo requires PaddlePaddle 1.7.0+, please see [here](https://www.paddlepadd pip install paddle-ernie ``` -or +or ```shell git clone https://github.com/PaddlePaddle/ERNIE.git --depth 1 @@ -117,10 +120,10 @@ pip install -e . | [ERNIE Gen Large 430G for English](https://ernie-github.cdn.bcebos.com/model-ernie-gen-large-430g-en.1.tar.gz)| Layer:24, Hidden:1024, Heads:16 + 430G pretrain corpus | ernie-gen-large-430g-en | ##### 4. 
download datasets
-
+
**English Datasets**

-Download the [GLUE datasets](https://gluebenchmark.com/tasks) by running [this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) 
+Download the [GLUE datasets](https://gluebenchmark.com/tasks) by running [this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)

the `--data_dir` option in the following section assumes a directory tree like this:

@@ -152,11 +155,16 @@ see [demo](https://ernie-github.cdn.bcebos.com/data-mnli-m.tar.gz) data for MNLI

- try eager execution with the `dygraph` model:

```script
-python3 ./ernie_d/demo/finetune_classifier_dygraph.py \
+python3 ./demo/finetune_classifier.py \
    --from_pretrained ernie-1.0 \
-    --data_dir ./data/xnli 
+    --data_dir ./data/xnli
```

+  - specify `--use_amp` to activate AMP training.
+  - `--bsz` denotes the global batch size for one optimization step; `--micro_bsz` denotes the maximum batch size for each GPU device.
+If `--micro_bsz < --bsz`, gradient accumulation will be activated.
+
+
- Distributed finetune

`paddle.distributed.launch` is a process manager; we use it to launch a python process on each available GPU device:

@@ -165,15 +173,15 @@ When in distributed training, `max_steps` is used as stopping criteria rather th
You could calculate `max_steps` with `EPOCH * NUM_TRAIN_EXAMPLES / TOTAL_BATCH`.
Also note that we shard the train data according to device id to prevent overfitting.

-demo: 
-(make sure you have more than 2 GPUs, 
-online model download can not work in `paddle.distributed.launch`, 
-you need to run single card finetuning first to get pretrained model, or donwload and extract one manualy from [here](#section-pretrained-models)): 
+demo:
+(make sure you have at least 2 GPUs;
+online model download does not work under `paddle.distributed.launch`,
+so you need to run single-card finetuning first to fetch the pretrained model, or download and extract one manually from [here](#section-pretrained-models)):

```script
python3 -m paddle.distributed.launch \
-./demo/finetune_classifier_dygraph_distributed.py \
+./demo/finetune_classifier_distributed.py \
--data_dir data/mnli \
--max_steps 10000 \
--from_pretrained ernie-2.0-en
```

@@ -182,11 +190,12 @@ python3 -m paddle.distributed.launch \

many other demo python scripts:

-1. [Sentiment Analysis](./demo/finetune_sentiment_analysis_dygraph.py)
-1. [Semantic Similarity](./demo/finetune_classifier_dygraph.py)
-1. [Name Entity Recognition(NER)](./demo/finetune_ner_dygraph.py)
-1. [Machine Reading Comprehension](./demo/finetune_mrc_dygraph.py)
+1. [Sentiment Analysis](./demo/finetune_sentiment_analysis.py)
+1. [Semantic Similarity](./demo/finetune_classifier.py)
+1. [Named Entity Recognition (NER)](./demo/finetune_ner.py)
+1. [Machine Reading Comprehension](./demo/finetune_mrc.py)
1. [Text generation](./demo/seq2seq/README.md)
+1. [Text classification with the `paddle.static` API](./demo/finetune_classifier_static.py)

@@ -220,7 +229,7 @@ see [here](./demo/pretrain/README.md)

# Online inference

-If `--inference_model_dir` is passed to `finetune_classifier_dygraph.py`, 
+If `--inference_model_dir` is passed to `finetune_classifier.py`,
a deployable model will be generated at the end of finetuning and your model is ready to serve.
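
As a quick local sanity check, the exported model can also be loaded with the `paddle.inference` API. The snippet below is a minimal sketch, not part of the demo scripts: the model directory, the 128-token all-zero dummy inputs, and the assumption that the exported directory can be passed straight to `inference.Config` are all placeholders to adapt; the expected input order follows the `[ids, sids]` order used when the model was traced.

```python
import numpy as np
from paddle import inference

# placeholder path: the directory passed via `--inference_model_dir`
config = inference.Config('./inference_model_dir')
config.enable_use_gpu(100, 0)  # GPU memory pool size in MB, device id; omit for CPU
predictor = inference.create_predictor(config)

# token ids and sentence (segment) ids, both of shape [batch, seqlen]
ids = np.zeros([1, 128], dtype='int64')
sids = np.zeros([1, 128], dtype='int64')
for name, data in zip(predictor.get_input_names(), [ids, sids]):
    predictor.get_input_handle(name).copy_from_cpu(data)

predictor.run()
logits = predictor.get_output_handle(predictor.get_output_names()[0]).copy_to_cpu()
print(logits)
```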
For details about online inference, see [C++ inference API](./inference/README.md),

@@ -244,14 +253,14 @@ sids = np.expand_dims(sids, 0)
result = client(ids, sids)
```

-A pre-made `inference model` for ernie-1.0 can be downloaded at [here](https://ernie.bj.bcebos.com/ernie1.0_zh_inference_model.tar.gz). 
+A pre-made `inference model` for ernie-1.0 can be downloaded [here](https://ernie.bj.bcebos.com/ernie1.0_zh_inference_model.tar.gz).
It can be used for feature-based finetuning or feature extraction.

# Distillation

-Knowledge distillation is good way to compress and accelerate ERNIE. 
+Knowledge distillation is a good way to compress and accelerate ERNIE.

-For details about distillation, see [here](./distill/README.md)
+For details about distillation, see [here](./demo/distill/README.md)

# Citation

@@ -271,7 +280,7 @@
    title={ERNIE 2.0: A Continual Pre-training Framework for Language Understanding},
    author={Sun, Yu and Wang, Shuohuan and Li, Yukun and Feng, Shikun and Tian, Hao and Wu, Hua and Wang, Haifeng},
    journal={arXiv preprint arXiv:1907.12412},
-    year={2019} 
+    year={2019}
}
```

@@ -306,4 +315,3 @@ For full reproduction of paper results, please checkout to `repro` branch of thi
- QQ discussion group: 760439550 (ERNIE discussion group).
- QQ discussion group: 958422639 (ERNIE discussion group-v2).
- [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
-
diff --git a/README.zh.md b/README.zh.md
index b874cd3..a895020 100644
--- a/README.zh.md
+++ b/README.zh.md
@@ -10,16 +10,20 @@ ERNIE是百度开创性提出的基于知识增强的持续学习语义理解框

# 新闻

-- 2020.9.24:
+- 2020.12.29:
+    - `ERNIE`开源工具套件全面升级 [PaddlePaddle v2.0](https://github.com/PaddlePaddle/Paddle/tree/release/2.0-rc)
+    - 所有demo教程均引入AMP(混合精度训练), 平均提速达2.3倍。
+    - 引入`Gradient accumulation`, 8G显存也可运行`ERNIE-large`模型。
+
+- 2020.9.24:
    - `ERNIE-ViL` 模型正式开源! ([点击进入](https://github.com/PaddlePaddle/ERNIE/tree/repro/ernie-vil))
        - 面向视觉-语言知识增强的预训练框架,首次在视觉-语言预训练引入结构化的知识。
        - 利用场景图中的知识,构建了物体、属性和关系预测任务,精细刻画模态间细粒度语义对齐。
        - 五项视觉-语言下游任务取得最好效果,[视觉常识推理榜单](https://visualcommonsense.com/)取得第一。
-
-
-- 2020.5.20:
+
+
+- 2020.5.20:
    - 欢迎试用`动态图`实现的 ERNIE:
-        - 基于[PaddlePaddle v1.8](https://github.com/PaddlePaddle/Paddle/tree/release/1.8)使用 ERNIE 进行 Pretrain 和 Finetune.
        - 动态执行, 所见即所得。
        - 大规模分布式训练。
        - 易于部署。
@@ -52,18 +56,16 @@ ERNIE是百度开创性提出的基于知识增强的持续学习语义理解框
# 快速上手
```python
import numpy as np
-import paddle.fluid.dygraph as D
+import paddle as P
from ernie.tokenizing_ernie import ErnieTokenizer
from ernie.modeling_ernie import ErnieModel

-D.guard().__enter__() # activate paddle `dygrpah` mode
-
model = ErnieModel.from_pretrained('ernie-1.0')    # Try to get pretrained model from server, make sure you have network connection
model.eval()
tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')

ids, _ = tokenizer.encode('hello world')
-ids = D.to_variable(np.expand_dims(ids, 0))  # insert extra `batch` dimension
+ids = P.to_tensor(np.expand_dims(ids, 0))  # insert extra `batch` dimension
pooled, encoded = model(ids)                 # eager execution
print(pooled.numpy())                        # convert results to numpy
```

# 教程

-手边没有GPU?欢迎在[AIStudio](https://aistudio.baidu.com/aistudio/index)中直接试用 ERNIE.
+手边没有GPU?欢迎在[AIStudio](https://aistudio.baidu.com/aistudio/index)中直接试用 ERNIE. (请选择最新版本的教程并申请GPU运行环境)

1.
[从0开始学ERNIE](https://aistudio.baidu.com/studio/edu/group/quick/join/314947) @@ -159,11 +161,16 @@ data/xnli - 使用 `动态图` 模型进行finetune: ```script -python3 ./ernie_d/demo/finetune_classifier_dygraph.py \ +python3 ./ernie_d/demo/finetune_classifier.py \ --from_pretrained ernie-1.0 \ - --data_dir ./data/xnli + --data_dir ./data/xnli ``` + - 加入`--use_amp`以启用AMP功能(请在支持`TensorCore`设备上启用AMP) + - 通过`--bsz`指定全局batch\_size(一步优化中模型所能见到的样本数), 通过`--micro_bsz` 指定输入给每一张GPU卡的样本数 +若`--bsz > --micro_bsz` 脚本会自动开启梯度累计功能. + + - 分布式 finetune `paddle.distributed.launch` 是一个进程管理器,我们采用它在每一张GPU上启动一个python进程,并配置相应的环境变量以进行分布式训练: @@ -177,7 +184,7 @@ python3 ./ernie_d/demo/finetune_classifier_dygraph.py \ ```script python3 -m paddle.distributed.launch \ -./demo/finetune_classifier_dygraph_distributed.py \ +./demo/finetune_classifier_distributed.py \ --data_dir data/mnli \ --max_steps 10000 \ --from_pretrained ernie2.0-en @@ -186,11 +193,12 @@ python3 -m paddle.distributed.launch \ 更多示例脚本: -1. [情感分析](./demo/finetune_sentiment_analysis_dygraph.py) -1. [语义匹配](./demo/finetune_classifier_dygraph.py) -1. [命名实体识别(NER)](./demo/finetune_ner_dygraph.py) -1. [机器阅读理解](./demo/finetune_mrc_dygraph.py) (需要多卡环境运行;参见上面"分布式 finetune"一节) +1. [情感分析](./demo/finetune_sentiment_analysis.py) +1. [语义匹配](./demo/finetune_classifier.py) +1. [命名实体识别(NER)](./demo/finetune_ner.py) +1. [机器阅读理解](./demo/finetune_mrc.py) (需要多卡环境运行;参见上面"分布式 finetune"一节) 1. [文本摘要生成](./demo/seq2seq/README.md) +1. [使用静态图完成文本分类](./demo/finetune_classifier_static.py) **推荐超参数设置:** @@ -221,7 +229,7 @@ python3 -m paddle.distributed.launch \ # 在线预测 -如果`finetune_classifier_dygraph.py`中指定了`--inference_model_dir`参数,funetune脚本会将你的模型序列化并产出可以直接部署线上预测的`inference_model`. +如果`finetune_classifier.py`中指定了`--inference_model_dir`参数,funetune脚本会将你的模型序列化并产出可以直接部署线上预测的`inference_model`. 关于生产环境中使用线上预测代码的实现细节,请见[C++ inference API](./inference/README.md). 或者你可以使用`propeller`启动一个多GPU预测服务(需要GPU环境),只需执行: @@ -254,7 +262,7 @@ ids = np.expand_dims(ids, -1) # ids.shape==[BATCH, SEQLEN, 1] # 蒸馏 -知识蒸馏是进行ERNIE模型压缩、加速的有效方式;关于知识蒸馏的实现细节请参见[这里](./distill/README.md)。 +知识蒸馏是进行ERNIE模型压缩、加速的有效方式;关于知识蒸馏的实现细节请参见[这里](./demo/distill/README.md)。 # 文献引用 @@ -274,7 +282,7 @@ ids = np.expand_dims(ids, -1) # ids.shape==[BATCH, SEQLEN, 1] title={ERNIE 2.0: A Continual Pre-training Framework for Language Understanding}, author={Sun, Yu and Wang, Shuohuan and Li, Yukun and Feng, Shikun and Tian, Hao and Wu, Hua and Wang, Haifeng}, journal={arXiv preprint arXiv:1907.12412}, - year={2019} + year={2019} } ``` @@ -309,4 +317,3 @@ ids = np.expand_dims(ids, -1) # ids.shape==[BATCH, SEQLEN, 1] - QQ 群: 760439550 (ERNIE discussion group). - QQ 2群: 958422639 (ERNIE discussion group-v2). - [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc. - diff --git a/demo/__init__.py b/demo/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/distill/README.md b/demo/distill/README.md similarity index 97% rename from distill/README.md rename to demo/distill/README.md index 78c3e73..bf31420 100644 --- a/distill/README.md +++ b/demo/distill/README.md @@ -9,7 +9,7 @@ # ERNIE Slim 数据蒸馏 在ERNIE强大的语义理解能力背后,是需要同样强大的算力才能支撑起如此大规模模型的训练和预测。很多工业应用场景对性能要求较高,若不能有效压缩则无法实际应用。 -![ernie_distill](../.metas/ernie_distill.png) +![ernie_distill](../../.metas/ernie_distill.png) 因此,如上图所示,我们基于[数据蒸馏技术](https://arxiv.org/pdf/1712.04440.pdf)构建了**ERNIE Slim数据蒸馏系统**。它的原理是通过数据作为桥梁,将ERNIE模型的知识迁移至小模型,以达到损失很小的效果却能达到上千倍的预测速度提升的效果。 @@ -18,11 +18,11 @@ - **Step 1**. 使用ERNIE模型对输入标注数据对进行fine-tune,得到Teacher Model - **Step 2**. 
使用ERNIE Service对以下无监督数据进行预测: - + 1. 用户提供的大规模无标注数据,需与标注数据同源 2. 对标注数据进行数据增强,具体增强策略见下节 - 3. 对无标注数据和数据增强数据进行一定比例混合 - + 3. 对无标注数据和数据增强数据进行一定比例混合 + - **Step 3.** 使用步骤2的数据训练出Student Model @@ -59,7 +59,6 @@ python ./distill/distill.py |---|---| |ERNIE-Finetune |95.4% | |非ERNIE基线(BOW)|90.1%| -|**+ 数据蒸馏** |91.4%| +|**+ 数据蒸馏** |91.4%| |非ERNIE基线(LSTM)|91.2%| |**+ 数据蒸馏**|93.9%| - diff --git a/demo/distill/distill.py b/demo/distill/distill.py new file mode 100644 index 0000000..a8c2c34 --- /dev/null +++ b/demo/distill/distill.py @@ -0,0 +1,298 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import os + +import numpy as np +from sklearn.metrics import f1_score +import paddle as P +from paddle.nn import functional as F +import propeller.paddle as propeller + +from ernie.tokenizing_ernie import ErnieTokenizer +from ernie.modeling_ernie import ErnieModelForSequenceClassification +from demo.utils import create_if_not_exists, get_warmup_and_linear_decay + +# 本例子采用chnsenticorp中文情感识别任务作为示范;并且事先通过数据增强扩充了蒸馏所需的无监督数据 +# +# 下载数据;并存放在 ./chnsenticorp-data/ +# 数据分为3列:原文;空格切词;情感标签 +# 其中第一列为ERNIE的输入;第二列为BoW词袋模型的输入 +# 事先统计好的BoW 词典在 ./chnsenticorp-data/vocab.bow.txt + +# 定义finetune teacher模型所需要的超参数 +DATA_DIR = './chnsenticorp-data/' +SEQLEN = 256 +BATCH = 32 +EPOCH = 10 +LR = 5e-5 + +tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') + +student_vocab = { + i.strip(): l + for l, i in enumerate( + open( + os.path.join(DATA_DIR, 'vocab.bow.txt'), encoding='utf8') + .readlines()) +} + + +def space_tokenizer(i): + return i.decode('utf8').split() + + +feature_column = propeller.data.FeatureColumns([ + propeller.data.TextColumn( + 'seg_a', + unk_id=tokenizer.unk_id, + vocab_dict=tokenizer.vocab, + tokenizer=tokenizer.tokenize), + propeller.data.TextColumn( + 'seg_a_student', + unk_id=student_vocab['[UNK]'], + vocab_dict=student_vocab, + tokenizer=space_tokenizer), + propeller.data.LabelColumn( + 'label', vocab_dict={ + b"0": 0, + b"1": 1, + }), +]) + + +def map_fn(seg_a, seg_a_student, label): + seg_a, _ = tokenizer.truncate(seg_a, [], seqlen=SEQLEN) + sentence, segments = tokenizer.build_for_ernie(seg_a) + return seg_a_student, sentence, segments, label + + +train_ds = feature_column.build_dataset('train', data_dir=os.path.join(DATA_DIR, 'train/'), shuffle=True, repeat=False, use_gz=False) \ + .map(map_fn) \ + .padded_batch(BATCH) + +train_ds_unlabel = feature_column.build_dataset('train-da', data_dir=os.path.join(DATA_DIR, 'train-data-augmented/'), shuffle=True, repeat=False, use_gz=False) \ + .map(map_fn) \ + .padded_batch(BATCH) + +dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(DATA_DIR, 'dev/'), shuffle=False, repeat=False, use_gz=False) \ + .map(map_fn) \ + .padded_batch(BATCH,) + +shapes = ([-1, SEQLEN], [-1, SEQLEN], [-1, SEQLEN], [-1]) +types = ('int64', 'int64', 'int64', 'int64') + +train_ds.data_shapes = shapes +train_ds.data_types = types +train_ds_unlabel.data_shapes = shapes +train_ds_unlabel.data_types = types 
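+# `shapes`/`types` describe one padded batch: three [-1, SEQLEN] int64 id
+# tensors (student ids, ernie token ids, segment ids) plus a [-1] label
+# vector; the dev set below reuses the same specification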
+dev_ds.data_shapes = shapes
+dev_ds.data_types = types
+
+place = P.CUDAPlace(0)
+
+
+def evaluate_teacher(model, dataset):
+    all_pred, all_label = [], []
+    with P.no_grad():
+        model.eval()
+        for step, (ids_student, ids, _, labels) in enumerate(
+                P.io.DataLoader(
+                    dataset, places=place, batch_size=None)):
+            _, logits = model(ids)
+            pred = logits.argmax(-1)
+            all_pred.extend(pred.numpy())
+            all_label.extend(labels.numpy())
+        f1 = f1_score(all_label, all_pred, average='macro')
+        model.train()
+        return f1
+
+
+teacher_model = ErnieModelForSequenceClassification.from_pretrained(
+    'ernie-1.0', num_labels=2)
+teacher_model.train()
+if not os.path.exists('./teacher_model.bin'):
+    g_clip = P.nn.ClipGradByGlobalNorm(1.0)  #experimental
+    lr_scheduler = P.optimizer.lr.LambdaDecay(
+        LR,
+        get_warmup_and_linear_decay(9600 * EPOCH / BATCH,
+                                    9600 * EPOCH * 0.1 / BATCH))
+
+    opt = P.optimizer.AdamW(
+        lr_scheduler,
+        parameters=teacher_model.parameters(),
+        weight_decay=0.01,
+        grad_clip=g_clip)
+    for epoch in range(EPOCH):
+        for step, (ids_student, ids, sids, labels) in enumerate(
+                P.io.DataLoader(
+                    train_ds, places=place, batch_size=None)):
+            loss, logits = teacher_model(ids, labels=labels)
+            loss.backward()
+            opt.step()
+            lr_scheduler.step()
+            teacher_model.clear_gradients()
+
+            if step % 10 == 0:
+                _lr = lr_scheduler.get_lr()
+                _l = loss.numpy()
+                msg = '[step-%d] train loss %.5f lr %.3e' % (step, _l, _lr)
+                print(msg)
+            if step % 100 == 0:
+                f1 = evaluate_teacher(teacher_model, dev_ds)
+                print('teacher f1: %.5f' % f1)
+    P.save(teacher_model.state_dict(), './teacher_model.bin')
+else:
+    state_dict = P.load('./teacher_model.bin')
+    teacher_model.set_state_dict(state_dict)
+    f1 = evaluate_teacher(teacher_model, dev_ds)
+    print('teacher f1: %.5f' % f1)
+
+# hyperparameters for finetuning the student model
+SEQLEN = 256
+BATCH = 32
+EPOCH = 10
+LR = 1e-4
+
+
+def evaluate_student(model, dataset):
+    all_pred, all_label = [], []
+    with P.no_grad():
+        model.eval()
+        for step, (ids_student, ids, _, labels) in enumerate(
+                P.io.DataLoader(
+                    dataset, places=place, batch_size=None)):
+            _, logits = model(ids_student)
+            pred = logits.argmax(-1)
+            all_pred.extend(pred.numpy())
+            all_label.extend(labels.numpy())
+        f1 = f1_score(all_label, all_pred, average='macro')
+        model.train()
+        return f1
+
+
+class BOW(P.nn.Layer):
+    def __init__(self):
+        super().__init__()
+        self.emb = P.nn.Embedding(len(student_vocab), 128, padding_idx=0)
+        self.fc = P.nn.Linear(128, 2)
+
+    def forward(self, ids, labels=None):
+        embbed = self.emb(ids)
+        pad_mask = (ids != 0).cast('float32').unsqueeze(-1)
+
+        # masked bag-of-words pooling over the sequence dimension
+        embbed = (embbed * pad_mask).sum(1)
+        embbed = F.softsign(embbed)
+        logits = self.fc(embbed)
+        if labels is not None:
+            if len(labels.shape) == 1:
+                labels = labels.reshape([-1, 1])
+            loss = F.cross_entropy(logits, labels).mean()
+        else:
+            loss = None
+        return loss, logits
+
+
+class CNN(P.nn.Layer):
+    # alternative student network; the demo below uses BOW
+    def __init__(self):
+        super().__init__()
+        self.emb = P.nn.Embedding(30002, 128, padding_idx=0)
+        self.cnn = P.nn.Conv2D(128, 128, (1, 3), padding=(0, 1))
+        # stride-1 max pooling keeps the sequence length unchanged
+        self.pool = P.nn.MaxPool2D((1, 3), stride=1, padding=(0, 1))
+        self.fc = P.nn.Linear(128, 2)
+
+    def forward(self, ids, labels=None):
+        embbed = self.emb(ids)
+        #d_batch, d_seqlen = ids.shape
+        hidden = embbed
+        hidden = hidden.transpose([0, 2, 1]).unsqueeze(2)  #change to NCWH
+        hidden = F.relu(self.cnn(hidden))
+        hidden = self.pool(hidden).squeeze(2).transpose([0, 2, 1])
+        pad_mask = (ids != 0).cast('float32').unsqueeze(-1)
+        hidden = F.softsign((hidden * pad_mask).sum(1))
+        logits =
self.fc(hidden) + if labels is not None: + if len(labels.shape) == 1: + labels = labels.reshape([-1, 1]) + loss = F.cross_entropy(logits, labels).mean() + else: + loss = None + return loss, logits + + +def KL(pred, target): + pred = F.log_softmax(pred) + target = F.softmax(target) + loss = F.kl_div(pred, target) + return loss + + +teacher_model.eval() +model = BOW() +g_clip = P.nn.ClipGradByGlobalNorm(1.0) #experimental + +lr_scheduler = P.optimizer.lr.LambdaDecay( + LR, + get_warmup_and_linear_decay(9600 * EPOCH / BATCH, + 9600 * EPOCH * 0.1 / BATCH)) + +opt = P.optimizer.AdamW( + lr_scheduler, + parameters=model.parameters(), + weight_decay=0.01, + grad_clip=g_clip) +model.train() + +for epoch in range(EPOCH - 1): + for step, ( + ids_student, ids, sids, label + ) in enumerate(P.io.DataLoader( + train_ds, places=place, batch_size=None)): + with P.no_grad(): + _, logits_t = teacher_model(ids, sids) # teacher 模型输出logits + _, logits_s = model(ids_student) # student 模型输出logits + loss_ce, _ = model(ids_student, labels=label) + loss_kd = KL(logits_s, logits_t.detach()) # 由KL divergence度量两个分布的距离 + loss = loss_ce + loss_kd + loss.backward() + opt.step() + lr_scheduler.step() + model.clear_gradients() + if step % 10 == 0: + _lr = lr_scheduler.get_lr() + _l = loss.numpy() + msg = '[step-%d] train loss %.5f lr %.3e' % (step, _l, _lr) + print(msg) + + f1 = evaluate_student(model, dev_ds) + print('student f1 %.5f' % f1) + +# 最后再加一轮hard label训练巩固结果 +for step, ( + ids_student, ids, sids, label +) in enumerate(P.io.DataLoader( + train_ds, places=place, batch_size=None)): + loss, _ = model(ids_student, labels=label) + loss.backward() + opt.step() + model.clear_gradients() + if step % 10 == 0: + _lr = lr_scheduler.get_lr() + _l = loss.numpy() + msg = '[step-%d] train loss %.5f lr %.3e' % (step, _l, _lr) + print(msg) + +f1 = evaluate_student(model, dev_ds) +print('final f1 %.5f' % f1) diff --git a/demo/finetune_classifier.py b/demo/finetune_classifier.py index 7f610e9..4e9540d 100644 --- a/demo/finetune_classifier.py +++ b/demo/finetune_classifier.py @@ -11,204 +11,257 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import - import os import re import time import logging -from random import random import json +from random import random from functools import reduce, partial +from visualdl import LogWriter import numpy as np -import multiprocessing -import tempfile -import re - -import paddle -import paddle.fluid as F -import paddle.fluid.layers as L - - -from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification -from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer -from ernie.optimization import optimization -#import utils.data +import logging +import argparse +from pathlib import Path +import paddle as P from propeller import log import propeller.paddle as propeller - log.setLevel(logging.DEBUG) logging.getLogger().setLevel(logging.DEBUG) -def model_fn(features, mode, params, run_config): - ernie = ErnieModelForSequenceClassification(params, name='') - if not params is propeller.RunMode.TRAIN: - ernie.eval() - - metrics, loss = None, None - if mode is propeller.RunMode.PREDICT: - src_ids, sent_ids = features - _, logits = ernie(src_ids, sent_ids) - predictions = [logits,] - else: - src_ids, sent_ids, labels = features - if mode is propeller.RunMode.EVAL: - loss, logits = ernie(src_ids, sent_ids, labels=labels) - pred = L.argmax(logits, axis=1) - acc = propeller.metrics.Acc(labels, pred) - metrics = {'acc': acc} - predictions = [pred] - else: - loss, logits = ernie(src_ids, sent_ids, labels=labels) - scheduled_lr, _ = optimization( - loss=loss, - warmup_steps=int(run_config.max_steps * params['warmup_proportion']), - num_train_steps=run_config.max_steps, - learning_rate=params['learning_rate'], - train_program=F.default_main_program(), - startup_prog=F.default_startup_program(), - use_fp16=params.use_fp16, - weight_decay=params['weight_decay'], - scheduler="linear_warmup_decay", - ) - propeller.summary.scalar('lr', scheduled_lr) - predictions = [logits,] - - return propeller.ModelSpec(loss=loss, mode=mode, metrics=metrics, predictions=predictions) - - -if __name__ == '__main__': - parser = propeller.ArgumentParser('DAN model with Paddle') - parser.add_argument('--do_predict', action='store_true') - parser.add_argument('--max_seqlen', type=int, default=128) - parser.add_argument('--data_dir', type=str, required=True) - parser.add_argument('--from_pretrained', type=str, required=True) - parser.add_argument('--warm_start_from', type=str) - parser.add_argument('--epoch', type=int, default=3) - parser.add_argument('--use_fp16', action='store_true') - - args = parser.parse_args() - - if not os.path.exists(args.from_pretrained): - raise ValueError('--from_pretrained not found: %s' % args.from_pretrained) - cfg_file_path = os.path.join(args.from_pretrained, 'ernie_config.json') - param_path = os.path.join(args.from_pretrained, 'params') - vocab_path = os.path.join(args.from_pretrained, 'vocab.txt') - - assert os.path.exists(cfg_file_path) and os.path.exists(param_path) and os.path.exists(vocab_path) - - hparams_cli = propeller.parse_hparam(args) - hparams_config_file = json.loads(open(cfg_file_path).read()) - default_hparams = propeller.HParams( - batch_size=32, - num_labels=3, - warmup_proportion=0.1, - learning_rate=5e-5, - weight_decay=0.01, - use_task_id=False, - use_fp16=args.use_fp16, - ) - - hparams = default_hparams.join(propeller.HParams(**hparams_config_file)).join(hparams_cli) - - default_run_config=dict( - max_steps=args.epoch * 390000 / hparams.batch_size, - save_steps=1000, - log_steps=10, - max_ckpt=1, - skip_steps=0, - 
model_dir=tempfile.mkdtemp(), - eval_steps=100) - run_config = dict(default_run_config, **json.loads(args.run_config)) - run_config = propeller.RunConfig(**run_config) - - tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained) - #tokenizer = ErnieTinyTokenizer.from_pretrained(args.from_pretrained) - unk_id = tokenizer.vocab['[UNK]'] - - shapes = ([-1, args.max_seqlen], [-1, args.max_seqlen], [-1]) - types = ('int64', 'int64', 'int64') - if not args.do_predict: - feature_column = propeller.data.FeatureColumns([ - propeller.data.TextColumn('title', unk_id=unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer.tokenize), - propeller.data.TextColumn('comment', unk_id=unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer.tokenize), - propeller.data.LabelColumn('label', vocab_dict={ - b"contradictory": 0, - b"contradiction": 0, - b"entailment": 1, - b"neutral": 2, - }), - ]) - - def map_fn(seg_a, seg_b, label): - seg_a, seg_b = tokenizer.truncate(seg_a, seg_b, seqlen=args.max_seqlen) - sentence, segments = tokenizer.build_for_ernie(seg_a, seg_b) - #label = np.expand_dims(label, -1) # - return sentence, segments, label - - train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=True, use_gz=False) \ - .map(map_fn) \ - .padded_batch(hparams.batch_size) - - dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \ - .map(map_fn) \ - .padded_batch(hparams.batch_size) - - test_ds = feature_column.build_dataset('test', data_dir=os.path.join(args.data_dir, 'test'), shuffle=False, repeat=False, use_gz=False) \ - .map(map_fn) \ - .padded_batch(hparams.batch_size) \ - - train_ds.data_shapes = shapes - train_ds.data_types = types - dev_ds.data_shapes = shapes - dev_ds.data_types = types - test_ds.data_shapes = shapes - test_ds.data_types = types - - varname_to_warmstart = re.compile(r'^encoder.*[wb]_0$|^.*embedding$|^.*bias$|^.*scale$|^pooled_fc.[wb]_0$') - - ws = propeller.WarmStartSetting( - predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(param_path, v.name)), - from_dir=param_path, - ) - - best_exporter = propeller.train.exporter.BestExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['dev']['acc'] > old['dev']['acc']) - propeller.train.train_and_eval( - model_class_or_model_fn=model_fn, - params=hparams, - run_config=run_config, - train_dataset=train_ds, - eval_dataset={'dev': dev_ds, 'test': test_ds}, - warm_start_setting=ws, - exporters=[best_exporter]) - - print('dev_acc3\t%.5f\ntest_acc3\t%.5f' % (best_exporter._best['dev']['acc'], best_exporter._best['test']['acc'])) - - else: - feature_column = propeller.data.FeatureColumns([ - propeller.data.TextColumn('title', unk_id=unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer.tokenize), - propeller.data.TextColumn('comment', unk_id=unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer.tokenize), - ]) - - def map_fn(seg_a, seg_b): - seg_a, seg_b = tokenizer.truncate(seg_a, seg_b, seqlen=args.max_seqlen) - sentence, segments = tokenizer.build_for_ernie(seg_a, seg_b) - return sentence, segments - - - predict_ds = feature_column.build_dataset_from_stdin('predict') \ +#from model.bert import BertConfig, BertModelLayer +from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification +from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer +#from ernie.optimization import AdamW, LinearDecay +from 
demo.utils import create_if_not_exists, get_warmup_and_linear_decay + +parser = argparse.ArgumentParser('classify model with ERNIE') +parser.add_argument( + '--from_pretrained', + type=Path, + required=True, + help='pretrained model directory or tag') +parser.add_argument( + '--max_seqlen', + type=int, + default=128, + help='max sentence length, should not greater than 512') +parser.add_argument( + '--bsz', + type=int, + default=128, + help='global batch size for each optimizer step') +parser.add_argument( + '--micro_bsz', + type=int, + default=32, + help='batch size for each device. if `--bsz` > `--micro_bsz` * num_device, will do grad accumulate' +) +parser.add_argument('--epoch', type=int, default=3, help='epoch') +parser.add_argument( + '--data_dir', + type=str, + required=True, + help='data directory includes train / develop data') +parser.add_argument( + '--use_lr_decay', + action='store_true', + help='if set, learning rate will decay to zero at `max_steps`') +parser.add_argument( + '--warmup_proportion', + type=float, + default=0.1, + help='if use_lr_decay is set, ' + 'learning rate will raise to `lr` at `warmup_proportion` * `max_steps` and decay to 0. at `max_steps`' +) +parser.add_argument('--lr', type=float, default=5e-5, help='learning rate') +parser.add_argument( + '--inference_model_dir', + type=Path, + default=None, + help='inference model output directory') +parser.add_argument( + '--save_dir', type=Path, required=True, help='model output directory') +parser.add_argument( + '--max_steps', + type=int, + default=None, + help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE') +parser.add_argument( + '--wd', type=float, default=0.01, help='weight decay, aka L2 regularizer') +parser.add_argument( + '--init_checkpoint', + type=str, + default=None, + help='checkpoint to warm start from') +parser.add_argument( + '--use_amp', + action='store_true', + help='only activate AMP(auto mixed precision accelatoin) on TensorCore compatible devices' +) + +args = parser.parse_args() + +if args.bsz > args.micro_bsz: + assert args.bsz % args.micro_bsz == 0, 'cannot perform gradient accumulate with bsz:%d micro_bsz:%d' % ( + args.bsz, args.micro_bsz) + acc_step = args.bsz // args.micro_bsz + log.info( + 'performing gradient accumulate: global_bsz:%d, micro_bsz:%d, accumulate_steps:%d' + % (args.bsz, args.micro_bsz, acc_step)) + args.bsz = args.micro_bsz +else: + acc_step = 1 + +tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained) +#tokenizer = ErnieTinyTokenizer.from_pretrained(args.from_pretrained) + +feature_column = propeller.data.FeatureColumns([ + propeller.data.TextColumn( + 'seg_a', + unk_id=tokenizer.unk_id, + vocab_dict=tokenizer.vocab, + tokenizer=tokenizer.tokenize), + propeller.data.TextColumn( + 'seg_b', + unk_id=tokenizer.unk_id, + vocab_dict=tokenizer.vocab, + tokenizer=tokenizer.tokenize), + propeller.data.LabelColumn( + 'label', + vocab_dict={ + b"contradictory": 0, + b"contradiction": 0, + b"entailment": 1, + b"neutral": 2, + }), +]) + + +def map_fn(seg_a, seg_b, label): + seg_a, seg_b = tokenizer.truncate(seg_a, seg_b, seqlen=args.max_seqlen) + sentence, segments = tokenizer.build_for_ernie(seg_a, seg_b) + return sentence, segments, label + + +train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=False, use_gz=False) \ .map(map_fn) \ - .padded_batch(hparams.batch_size) \ - - predict_ds.data_shapes = shapes[: -1] - predict_ds.data_types = types[: -1] - - est = propeller.Learner(model_fn, 
run_config, hparams) - for res, in est.predict(predict_ds, ckpt=-1): - print('%d\t%.5f\t%.5f\t%.5f' % (np.argmax(res), res[0], res[1], res[2])) - + .padded_batch(args.bsz, (0, 0, 0)) +dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \ + .map(map_fn) \ + .padded_batch(args.bsz, (0, 0, 0)) + +place = P.CUDAPlace(0) +model = ErnieModelForSequenceClassification.from_pretrained( + args.from_pretrained, num_labels=3, name='') + +if args.init_checkpoint is not None: + log.info('loading checkpoint from %s' % args.init_checkpoint) + sd = P.load(args.init_checkpoint) + model.set_state_dict(sd) + +g_clip = P.nn.ClipGradByGlobalNorm(1.0) #experimental +param_name_to_exclue_from_weight_decay = re.compile( + r'.*layer_norm_scale|.*layer_norm_bias|.*b_0') +if args.use_lr_decay: + lr_scheduler = P.optimizer.lr.LambdaDecay( + args.lr, + get_warmup_and_linear_decay( + args.max_steps, int(args.warmup_proportion * args.max_steps))) + opt = P.optimizer.AdamW( + lr_scheduler, + parameters=model.parameters(), + weight_decay=args.wd, + apply_decay_param_fun=lambda n: param_name_to_exclue_from_weight_decay.match(n), + grad_clip=g_clip) +else: + lr_scheduler = None + opt = P.optimizer.Adam( + args.lr, + parameters=model.parameters(), + weight_decay=args.wd, + apply_decay_param_fun=lambda n: param_name_to_exclue_from_weight_decay.match(n), + grad_clip=g_clip) + +scaler = P.amp.GradScaler(enable=args.use_amp) +step, inter_step = 0, 0 +with LogWriter( + logdir=str(create_if_not_exists(args.save_dir / 'vdl'))) as log_writer: + with P.amp.auto_cast(enable=args.use_amp): + for epoch in range(args.epoch): + for ids, sids, label in P.io.DataLoader( + train_ds, places=P.CUDAPlace(0), batch_size=None): + inter_step += 1 + loss, _ = model(ids, sids, labels=label) + loss /= acc_step + loss = scaler.scale(loss) + loss.backward() + if inter_step % acc_step != 0: + continue + step += 1 + scaler.minimize(opt, loss) + model.clear_gradients() + lr_scheduler and lr_scheduler.step() + + if step % 10 == 0: + _lr = lr_scheduler.get_lr() + if args.use_amp: + _l = (loss / scaler._scale).numpy() + msg = '[step-%d] train loss %.5f lr %.3e scaling %.3e' % ( + step, _l, _lr, scaler._scale.numpy()) + else: + _l = loss.numpy() + msg = '[step-%d] train loss %.5f lr %.3e' % (step, _l, + _lr) + log.debug(msg) + log_writer.add_scalar('loss', _l, step=step) + log_writer.add_scalar('lr', _lr, step=step) + if step % 100 == 0: + acc = [] + with P.no_grad(): + model.eval() + for ids, sids, label in P.io.DataLoader( + dev_ds, places=P.CUDAPlace(0), + batch_size=None): + loss, logits = model(ids, sids, labels=label) + #print('\n'.join(map(str, logits.numpy().tolist()))) + a = (logits.argmax(-1) == label) + acc.append(a.numpy()) + model.train() + acc = np.concatenate(acc).mean() + log_writer.add_scalar('eval/acc', acc, step=step) + log.debug('acc %.5f' % acc) + if args.save_dir is not None: + P.save(model.state_dict(), args.save_dir / 'ckpt.bin') +if args.save_dir is not None: + P.save(model.state_dict(), args.save_dir / 'ckpt.bin') +if args.inference_model_dir is not None: + + class InferenceModel(ErnieModelForSequenceClassification): + def forward(self, ids, sids): + _, logits = super(InferenceModel, self).forward(ids, sids) + return logits + + model.__class__ = InferenceModel + log.debug('saving inference model') + src_placeholder = P.zeros([2, 2], dtype='int64') + sent_placehodler = P.zeros([2, 2], dtype='int64') + _, static = P.jit.TracedLayer.trace( + model, 
inputs=[src_placeholder, sent_placehodler]) + static.save_inference_model(str(args.inference_model_dir)) + + #class InferenceModel(ErnieModelForSequenceClassification): + # @P.jit.to_static + # def forward(self, ids, sids): + # _, logits = super(InferenceModel, self).forward(ids, sids, labels=None) + # return logits + #model.__class__ = InferenceModel + #src_placeholder = P.zeros([2, 2], dtype='int64') + #sent_placehodler = P.zeros([2, 2], dtype='int64') + #P.jit.save(model, args.inference_model_dir, input_var=[src_placeholder, sent_placehodler]) + log.debug('done') diff --git a/demo/finetune_classifier_distributed.py b/demo/finetune_classifier_distributed.py new file mode 100644 index 0000000..3dfa2f6 --- /dev/null +++ b/demo/finetune_classifier_distributed.py @@ -0,0 +1,205 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import logging +import json +import re +from random import random +from functools import reduce, partial + +import numpy as np +import logging +#from visualdl import LogWriter + +from pathlib import Path +import paddle as P +from propeller import log +import propeller.paddle as propeller + +#from model.bert import BertConfig, BertModelLayer +from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification +from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer +#from ernie.optimization import AdamW, LinearDecay +from demo.utils import create_if_not_exists, get_warmup_and_linear_decay + +log.setLevel(logging.DEBUG) +logging.getLogger().setLevel(logging.DEBUG) + +parser = propeller.ArgumentParser('classify model with ERNIE') +parser.add_argument( + '--from_pretrained', + type=Path, + required=True, + help='pretrained model directory or tag') +parser.add_argument( + '--max_seqlen', + type=int, + default=128, + help='max sentence length, should not greater than 512') +parser.add_argument('--bsz', type=int, default=32, help='batchsize') +parser.add_argument( + '--data_dir', + type=str, + required=True, + help='data directory includes train / develop data') +parser.add_argument( + '--max_steps', + type=int, + required=True, + help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE') +parser.add_argument('--warmup_proportion', type=float, default=0.1) +parser.add_argument('--lr', type=float, default=5e-5, help='learning rate') +parser.add_argument( + '--save_dir', type=Path, required=True, help='model output directory') +parser.add_argument( + '--wd', type=float, default=0.01, help='weight decay, aka L2 regularizer') +parser.add_argument( + '--init_checkpoint', + type=str, + default=None, + help='checkpoint to warm start from') + +parser.add_argument( + '--use_amp', + action='store_true', + help='only activate AMP(auto mixed precision accelatoin) on TensorCore compatible devices' +) + +args = parser.parse_args() +env = P.distributed.ParallelEnv() + +tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained) +#tokenizer = 
ErnieTinyTokenizer.from_pretrained(args.from_pretrained) + +feature_column = propeller.data.FeatureColumns([ + propeller.data.TextColumn( + 'seg_a', + unk_id=tokenizer.unk_id, + vocab_dict=tokenizer.vocab, + tokenizer=tokenizer.tokenize), + propeller.data.TextColumn( + 'seg_b', + unk_id=tokenizer.unk_id, + vocab_dict=tokenizer.vocab, + tokenizer=tokenizer.tokenize), + propeller.data.LabelColumn( + 'label', vocab_dict={ + b"0": 0, + b"1": 1, + b"2": 2, + }), +]) + + +def map_fn(seg_a, seg_b, label): + seg_a, seg_b = tokenizer.truncate(seg_a, seg_b, seqlen=args.max_seqlen) + sentence, segments = tokenizer.build_for_ernie(seg_a, seg_b) + return sentence, segments, label + +train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), + shuffle=True, repeat=True, use_gz=False, shard=True) \ + .map(map_fn) \ + .padded_batch(args.bsz, (0, 0, 0)) + +dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), + shuffle=False, repeat=False, use_gz=False) \ + .map(map_fn) \ + .padded_batch(args.bsz, (0, 0, 0)) + +shapes = ([-1, args.max_seqlen], [-1, args.max_seqlen], [-1]) +types = ('int64', 'int64', 'int64') + +P.distributed.init_parallel_env() +model = ErnieModelForSequenceClassification.from_pretrained( + args.from_pretrained, num_labels=3, name='') + +if args.init_checkpoint is not None: + log.info('loading checkpoint from %s' % args.init_checkpoint) + sd, _ = P.load(args.init_checkpoint) + model.set_state_dict(sd) + +model = P.DataParallel(model) + +g_clip = P.nn.ClipGradByGlobalNorm(1.0) #experimental +param_name_to_exclue_from_weight_decay = re.compile( + r'.*layer_norm_scale|.*layer_norm_bias|.*b_0') + +lr_scheduler = P.optimizer.lr.LambdaDecay( + args.lr, + get_warmup_and_linear_decay(args.max_steps, + int(args.warmup_proportion * args.max_steps))) +opt = P.optimizer.AdamW( + learning_rate=lr_scheduler, + parameters=model.parameters(), + apply_decay_param_fun=lambda n: param_name_to_exclue_from_weight_decay.match(n), + weight_decay=args.wd, + grad_clip=g_clip) +scaler = P.amp.GradScaler(enable=args.use_amp) +step = 0 +create_if_not_exists(args.save_dir) + +#with LogWriter(logdir=str(create_if_not_exists(args.save_dir / 'vdl-%d' % env.dev_id))) as log_writer: +with P.amp.auto_cast(enable=args.use_amp): + for ids, sids, label in P.io.DataLoader( + train_ds, places=P.CUDAPlace(env.dev_id), batch_size=None): + step += 1 + loss, _ = model(ids, sids, labels=label) + loss = scaler.scale(loss) + loss.backward() + scaler.minimize(opt, loss) + model.clear_gradients() + lr_scheduler.step() + + # do logging + if step % 10 == 0: + _lr = lr_scheduler.get_lr() + if args.use_amp: + _l = (loss / scaler._scale).numpy() + msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % ( + env.dev_id, step, _l, _lr, scaler._scale.numpy()) + else: + _l = loss.numpy() + msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % ( + env.dev_id, step, _l, _lr) + log.debug(msg) + #log_writer.add_scalar('loss', _l, step=step) + #log_writer.add_scalar('lr', _lr, step=step) + + # do saving + if step % 100 == 0 and env.dev_id == 0: + acc = [] + with P.no_grad(): + model.eval() + for d in P.io.DataLoader( + dev_ds, places=P.CUDAPlace(env.dev_id), + batch_size=None): + ids, sids, label = d + loss, logits = model(ids, sids, labels=label) + a = (logits.argmax(-1) == label) + acc.append(a.numpy()) + model.train() + acc = np.concatenate(acc).mean() + #log_writer.add_scalar('eval/acc', acc, step=step) + log.debug('acc %.5f' % acc) + if args.save_dir is not None: + 
P.save(model.state_dict(), args.save_dir / 'ckpt.bin') + # exit + if step > args.max_steps: + break + +if args.save_dir is not None and env.dev_id == 0: + P.save(model.state_dict(), args.save_dir / 'ckpt.bin') +log.debug('done') diff --git a/demo/finetune_classifier_dygraph.py b/demo/finetune_classifier_dygraph.py deleted file mode 100644 index d82eade..0000000 --- a/demo/finetune_classifier_dygraph.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import re -import time -import logging -import json -from random import random -from tqdm import tqdm -from functools import reduce, partial - -import numpy as np -import logging -import argparse - -import paddle -import paddle.fluid as F -import paddle.fluid.dygraph as FD -import paddle.fluid.layers as L - -from propeller import log -import propeller.paddle as propeller - -log.setLevel(logging.DEBUG) -logging.getLogger().setLevel(logging.DEBUG) - - -#from model.bert import BertConfig, BertModelLayer -from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification -from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer -from ernie.optimization import AdamW, LinearDecay - - -if __name__ == '__main__': - parser = argparse.ArgumentParser('classify model with ERNIE') - parser.add_argument('--from_pretrained', type=str, required=True, help='pretrained model directory or tag') - parser.add_argument('--max_seqlen', type=int, default=128, help='max sentence length, should not greater than 512') - parser.add_argument('--bsz', type=int, default=32, help='batchsize') - parser.add_argument('--epoch', type=int, default=3, help='epoch') - parser.add_argument('--data_dir', type=str, required=True, help='data directory includes train / develop data') - parser.add_argument('--use_lr_decay', action='store_true', help='if set, learning rate will decay to zero at `max_steps`') - parser.add_argument('--warmup_proportion', type=float, default=0.1, help='if use_lr_decay is set, ' - 'learning rate will raise to `lr` at `warmup_proportion` * `max_steps` and decay to 0. 
at `max_steps`') - parser.add_argument('--lr', type=float, default=5e-5, help='learning rate') - parser.add_argument('--inference_model_dir', type=str, default=None, help='inference model output directory') - parser.add_argument('--save_dir', type=str, default=None, help='model output directory') - parser.add_argument('--max_steps', type=int, default=None, help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE') - parser.add_argument('--wd', type=float, default=0.01, help='weight decay, aka L2 regularizer') - parser.add_argument('--init_checkpoint', type=str, default=None, help='checkpoint to warm start from') - - - args = parser.parse_args() - - tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained) - #tokenizer = ErnieTinyTokenizer.from_pretrained(args.from_pretrained) - - feature_column = propeller.data.FeatureColumns([ - propeller.data.TextColumn('seg_a', unk_id=tokenizer.unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer.tokenize), - propeller.data.TextColumn('seg_b', unk_id=tokenizer.unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer.tokenize), - propeller.data.LabelColumn('label', vocab_dict={ - b"contradictory": 0, - b"contradiction": 0, - b"entailment": 1, - b"neutral": 2, - }), - ]) - - def map_fn(seg_a, seg_b, label): - seg_a, seg_b = tokenizer.truncate(seg_a, seg_b, seqlen=args.max_seqlen) - sentence, segments = tokenizer.build_for_ernie(seg_a, seg_b) - return sentence, segments, label - - - train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=False, use_gz=False) \ - .map(map_fn) \ - .padded_batch(args.bsz, (0, 0, 0)) - - dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \ - .map(map_fn) \ - .padded_batch(args.bsz, (0, 0, 0)) - - - shapes = ([-1, args.max_seqlen], [-1, args.max_seqlen], [-1]) - types = ('int64', 'int64', 'int64') - - train_ds.data_shapes = shapes - train_ds.data_types = types - dev_ds.data_shapes = shapes - dev_ds.data_types = types - - place = F.CUDAPlace(0) - with FD.guard(place): - model = ErnieModelForSequenceClassification.from_pretrained(args.from_pretrained, num_labels=3, name='') - - if args.init_checkpoint is not None: - log.info('loading checkpoint from %s' % args.init_checkpoint) - sd, _ = FD.load_dygraph(args.init_checkpoint) - model.set_dict(sd) - - g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental - if args.use_lr_decay: - opt = AdamW(learning_rate=LinearDecay(args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps), parameter_list=model.parameters(), weight_decay=args.wd, grad_clip=g_clip) - else: - opt = AdamW(args.lr, parameter_list=model.parameters(), weight_decay=args.wd, grad_clip=g_clip) - - for epoch in range(args.epoch): - for step, d in enumerate(tqdm(train_ds.start(place), desc='training')): - ids, sids, label = d - loss, _ = model(ids, sids, labels=label) - loss.backward() - if step % 10 == 0: - log.debug('train loss %.5f lr %.3e' % (loss.numpy(), opt.current_step_lr())) - opt.minimize(loss) - model.clear_gradients() - if step % 100 == 0: - acc = [] - with FD.base._switch_tracer_mode_guard_(is_train=False): - model.eval() - for step, d in enumerate(tqdm(dev_ds.start(place), desc='evaluating %d' % epoch)): - ids, sids, label = d - loss, logits = model(ids, sids, labels=label) - #print('\n'.join(map(str, logits.numpy().tolist()))) - a = L.argmax(logits, -1) == label - acc.append(a.numpy()) - model.train() - log.debug('acc %.5f' % 
np.concatenate(acc).mean()) - if args.save_dir is not None: - F.save_dygraph(model.state_dict(), args.save_dir) - if args.inference_model_dir is not None: - log.debug('saving inference model') - class InferemceModel(ErnieModelForSequenceClassification): - def forward(self, *args, **kwargs): - _, logits = super(InferemceModel, self).forward(*args, **kwargs) - return logits - model.__class__ = InferemceModel #dynamic change model type, to make sure forward output doesn't contain `None` - src_placeholder = FD.to_variable(np.ones([1, 1], dtype=np.int64)) - sent_placehodler = FD.to_variable(np.zeros([1, 1], dtype=np.int64)) - model(src_placeholder, sent_placehodler) - _, static_model = FD.TracedLayer.trace(model, inputs=[src_placeholder, sent_placehodler]) - static_model.save_inference_model(args.inference_model_dir) - log.debug('done') - - - - diff --git a/demo/finetune_classifier_dygraph_distributed.py b/demo/finetune_classifier_dygraph_distributed.py deleted file mode 100644 index 42e8eed..0000000 --- a/demo/finetune_classifier_dygraph_distributed.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import re -import time -import logging -import json -from random import random -from tqdm import tqdm -from functools import reduce, partial - -import numpy as np -import logging - -import paddle -import paddle.fluid as F -import paddle.fluid.dygraph as FD -import paddle.fluid.layers as L - -from propeller import log -import propeller.paddle as propeller - -log.setLevel(logging.DEBUG) -logging.getLogger().setLevel(logging.DEBUG) - - -#from model.bert import BertConfig, BertModelLayer -from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification -from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer -from ernie.optimization import AdamW, LinearDecay - - -if __name__ == '__main__': - parser = propeller.ArgumentParser('classify model with ERNIE') - parser.add_argument('--from_pretrained', type=str, required=True, help='pretrained model directory or tag') - parser.add_argument('--max_seqlen', type=int, default=128, help='max sentence length, should not greater than 512') - parser.add_argument('--bsz', type=int, default=32, help='batchsize') - parser.add_argument('--data_dir', type=str, required=True, help='data directory includes train / develop data') - parser.add_argument('--max_steps', type=int, required=True, help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE') - parser.add_argument('--warmup_proportion', type=float, default=0.1) - parser.add_argument('--lr', type=float, default=5e-5, help='learning rate') - parser.add_argument('--save_dir', type=str, default=None, help='model output directory') - parser.add_argument('--wd', type=int, default=0.01, help='weight decay, aka L2 regularizer') - parser.add_argument('--init_checkpoint', type=str, default=None, help='checkpoint to warm start from') - - args = parser.parse_args() - - tokenizer = 
ErnieTokenizer.from_pretrained(args.from_pretrained) - #tokenizer = ErnieTinyTokenizer.from_pretrained(args.from_pretrained) - - feature_column = propeller.data.FeatureColumns([ - propeller.data.TextColumn('seg_a', unk_id=tokenizer.unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer.tokenize), - propeller.data.TextColumn('seg_b', unk_id=tokenizer.unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer.tokenize), - propeller.data.LabelColumn('label', vocab_dict={ - b"0": 0, - b"1": 1, - b"2": 2, - }), - ]) - - def map_fn(seg_a, seg_b, label): - seg_a, seg_b = tokenizer.truncate(seg_a, seg_b, seqlen=args.max_seqlen) - sentence, segments = tokenizer.build_for_ernie(seg_a, seg_b) - return sentence, segments, label - - - train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=False, repeat=True, use_gz=False) \ - .map(map_fn) \ - .padded_batch(args.bsz, (0, 0, 0)) - train_ds = train_ds.shard(propeller.train.distribution.status.num_replica, propeller.train.distribution.status.replica_id) - log.debug('shard %d/%d'%(propeller.train.distribution.status.num_replica, propeller.train.distribution.status.replica_id)) - train_ds = train_ds.shuffle(10000) - - dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \ - .map(map_fn) \ - .padded_batch(args.bsz, (0, 0, 0)) - - - shapes = ([-1, args.max_seqlen], [-1, args.max_seqlen], [-1]) - types = ('int64', 'int64', 'int64') - - train_ds.data_shapes = shapes - train_ds.data_types = types - dev_ds.data_shapes = shapes - dev_ds.data_types = types - - place = F.CUDAPlace(FD.parallel.Env().dev_id) - with FD.guard(place): - ctx = FD.parallel.prepare_context() - model = ErnieModelForSequenceClassification.from_pretrained(args.from_pretrained, num_labels=3, name='') - - if args.init_checkpoint is not None: - log.info('loading checkpoint from %s' % args.init_checkpoint) - sd, _ = FD.load_dygraph(args.init_checkpoint) - model.set_dict(sd) - - model = FD.parallel.DataParallel(model, ctx) - - g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental - opt = AdamW(learning_rate=LinearDecay( - args.lr, - int(args.warmup_proportion * args.max_steps), - args.max_steps), - parameter_list=model.parameters(), - weight_decay=args.wd, - grad_clip=g_clip) - - for step, d in enumerate(tqdm(train_ds.start(place), desc='training')): - ids, sids, label = d - loss, _ = model(ids, sids, labels=label) - scaled_loss = model.scale_loss(loss) - scaled_loss.backward() - model.apply_collective_grads() - opt.minimize(scaled_loss) - model.clear_gradients() - if step % 10 == 0: - log.debug('train loss %.5f, lr %.e3' % (loss.numpy(), opt.current_step_lr())) - if step % 100 == 0 and FD.parallel.Env().dev_id == 0: - acc = [] - with FD.base._switch_tracer_mode_guard_(is_train=False): - model.eval() - for step, d in enumerate(tqdm(dev_ds.start(place), desc='evaluating')): - ids, sids, label = d - loss, logits = model(ids, sids, labels=label) - #print('\n'.join(map(str, logits.numpy().tolist()))) - a = L.argmax(logits, -1) == label - acc.append(a.numpy()) - model.train() - log.debug('acc %.5f' % np.concatenate(acc).mean()) - if step > args.max_steps: - break - - if args.save_dir is not None: - F.save_dygraph(model.state_dict(), args.save_dir) diff --git a/demo/finetune_classifier_static.py b/demo/finetune_classifier_static.py new file mode 100644 index 0000000..0086cde --- /dev/null +++ b/demo/finetune_classifier_static.py @@ -0,0 +1,251 @@ +# Copyright (c) 2018 
PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+
+import os
+import re
+import time
+import logging
+from random import random
+import json
+from functools import reduce, partial
+
+import numpy as np
+import multiprocessing
+import tempfile
+
+import paddle as P
+
+from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification
+from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer
+from demo.optimization import optimization
+#import utils.data
+
+from propeller import log
+import propeller.paddle as propeller
+
+log.setLevel(logging.DEBUG)
+logging.getLogger().setLevel(logging.DEBUG)
+
+
+def model_fn(features, mode, params, run_config):
+    ernie = ErnieModelForSequenceClassification(params, name='')
+    if mode is not propeller.RunMode.TRAIN:
+        ernie.eval()
+    else:
+        ernie.train()
+
+    metrics, loss, train_hooks = None, None, None
+    if mode is propeller.RunMode.PREDICT:
+        src_ids, sent_ids = features
+        _, logits = ernie(src_ids, sent_ids)
+        predictions = [logits, ]
+    else:
+        src_ids, sent_ids, labels = features
+        if mode is propeller.RunMode.EVAL:
+            loss, logits = ernie(src_ids, sent_ids, labels=labels)
+            pred = logits.argmax(axis=1)
+            acc = propeller.metrics.Acc(labels, pred)
+            metrics = {'acc': acc}
+            predictions = [pred]
+        else:
+            loss, logits = ernie(src_ids, sent_ids, labels=labels)
+            lr_step_hook, loss_scale_coef = optimization(
+                loss=loss,
+                warmup_steps=int(run_config.max_steps *
+                                 params['warmup_proportion']),
+                num_train_steps=run_config.max_steps,
+                learning_rate=params['learning_rate'],
+                train_program=P.static.default_main_program(),
+                startup_prog=P.static.default_startup_program(),
+                use_fp16=args.use_amp,
+                weight_decay=params['weight_decay'],
+                scheduler="linear_warmup_decay", )
+            scheduled_lr = P.static.default_main_program().global_block().var(
+                'learning_rate_0')
+            propeller.summary.scalar('lr', scheduled_lr)
+            predictions = [logits, ]
+            train_hooks = [lr_step_hook]
+
+    return propeller.ModelSpec(
+        loss=loss,
+        mode=mode,
+        metrics=metrics,
+        predictions=predictions,
+        train_hooks=train_hooks)
+
+
+if __name__ == '__main__':
+    parser = propeller.ArgumentParser('classify model with ERNIE (static graph)')
+    parser.add_argument('--do_predict', action='store_true')
+    parser.add_argument('--max_seqlen', type=int, default=128)
+    parser.add_argument('--data_dir', type=str, required=True)
+    parser.add_argument('--from_pretrained', type=str, required=True)
+    parser.add_argument('--warm_start_from', type=str)
+    parser.add_argument('--epoch', type=int, default=3)
+    parser.add_argument('--use_amp', action='store_true')
+
+    args = parser.parse_args()
+
+    P.enable_static()
+
+    if not os.path.exists(args.from_pretrained):
+        raise ValueError('--from_pretrained not found: %s' %
+                         args.from_pretrained)
+    cfg_file_path = os.path.join(args.from_pretrained, 'ernie_config.json')
+    param_path = os.path.join(args.from_pretrained, 'params')
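+    # note: a pretrained-model directory is expected to provide
+    # ernie_config.json, a params/ subdirectory and vocab.txt
+    # (all three are checked by the assert below)
+    vocab_path = 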
os.path.join(args.from_pretrained, 'vocab.txt') + + assert os.path.exists(cfg_file_path) and os.path.exists( + param_path) and os.path.exists(vocab_path) + + hparams_cli = propeller.parse_hparam(args) + hparams_config_file = json.loads(open(cfg_file_path).read()) + default_hparams = propeller.HParams( + batch_size=32, + num_labels=3, + warmup_proportion=0.1, + learning_rate=5e-5, + weight_decay=0.01, + use_task_id=False, + use_fp16=args.use_amp) + + hparams = default_hparams.join(propeller.HParams( + **hparams_config_file)).join(hparams_cli) + + default_run_config = dict( + max_steps=args.epoch * 390000 / hparams.batch_size, + save_steps=1000, + log_steps=10, + max_ckpt=1, + skip_steps=0, + model_dir=tempfile.mkdtemp(), + eval_steps=100) + run_config = dict(default_run_config, **json.loads(args.run_config)) + run_config = propeller.RunConfig(**run_config) + + tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained) + #tokenizer = ErnieTinyTokenizer.from_pretrained(args.from_pretrained) + unk_id = tokenizer.vocab['[UNK]'] + + shapes = ([-1, args.max_seqlen], [-1, args.max_seqlen], [-1]) + types = ('int64', 'int64', 'int64') + if not args.do_predict: + feature_column = propeller.data.FeatureColumns([ + propeller.data.TextColumn( + 'title', + unk_id=unk_id, + vocab_dict=tokenizer.vocab, + tokenizer=tokenizer.tokenize), + propeller.data.TextColumn( + 'comment', + unk_id=unk_id, + vocab_dict=tokenizer.vocab, + tokenizer=tokenizer.tokenize), + propeller.data.LabelColumn( + 'label', + vocab_dict={ + b"contradictory": 0, + b"contradiction": 0, + b"entailment": 1, + b"neutral": 2, + }), + ]) + + def map_fn(seg_a, seg_b, label): + seg_a, seg_b = tokenizer.truncate( + seg_a, seg_b, seqlen=args.max_seqlen) + sentence, segments = tokenizer.build_for_ernie(seg_a, seg_b) + #label = np.expand_dims(label, -1) # + return sentence, segments, label + + train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=True, use_gz=False) \ + .map(map_fn) \ + .padded_batch(hparams.batch_size) + + dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \ + .map(map_fn) \ + .padded_batch(hparams.batch_size) + + test_ds = feature_column.build_dataset('test', data_dir=os.path.join(args.data_dir, 'test'), shuffle=False, repeat=False, use_gz=False) \ + .map(map_fn) \ + .padded_batch(hparams.batch_size) \ + + train_ds.data_shapes = shapes + train_ds.data_types = types + dev_ds.data_shapes = shapes + dev_ds.data_types = types + test_ds.data_shapes = shapes + test_ds.data_types = types + + varname_to_warmstart = re.compile( + r'^encoder.*[wb]_0$|^.*embedding$|^.*bias$|^.*scale$|^pooled_fc.[wb]_0$' + ) + + ws = propeller.WarmStartSetting( + predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(param_path, v.name)), + from_dir=param_path, + ) + + best_exporter = propeller.train.exporter.BestExporter( + os.path.join(run_config.model_dir, 'best'), + cmp_fn=lambda old, new: new['dev']['acc'] > old['dev']['acc']) + propeller.train.train_and_eval( + model_class_or_model_fn=model_fn, + params=hparams, + run_config=run_config, + train_dataset=train_ds, + eval_dataset={'dev': dev_ds, + 'test': test_ds}, + warm_start_setting=ws, + exporters=[best_exporter]) + + print('dev_acc3\t%.5f\ntest_acc3\t%.5f' % + (best_exporter._best['dev']['acc'], + best_exporter._best['test']['acc'])) + + else: + feature_column = propeller.data.FeatureColumns([ + 
propeller.data.TextColumn( + 'title', + unk_id=unk_id, + vocab_dict=tokenizer.vocab, + tokenizer=tokenizer.tokenize), + propeller.data.TextColumn( + 'comment', + unk_id=unk_id, + vocab_dict=tokenizer.vocab, + tokenizer=tokenizer.tokenize), + ]) + + def map_fn(seg_a, seg_b): + seg_a, seg_b = tokenizer.truncate( + seg_a, seg_b, seqlen=args.max_seqlen) + sentence, segments = tokenizer.build_for_ernie(seg_a, seg_b) + return sentence, segments + + + predict_ds = feature_column.build_dataset_from_stdin('predict') \ + .map(map_fn) \ + .padded_batch(hparams.batch_size) \ + + predict_ds.data_shapes = shapes[:-1] + predict_ds.data_types = types[:-1] + + est = propeller.Learner(model_fn, run_config, hparams) + for res, in est.predict(predict_ds, ckpt=-1): + print('%d\t%.5f\t%.5f\t%.5f' % + (np.argmax(res), res[0], res[1], res[2])) diff --git a/demo/finetune_mrc.py b/demo/finetune_mrc.py new file mode 100644 index 0000000..9b8ffe0 --- /dev/null +++ b/demo/finetune_mrc.py @@ -0,0 +1,247 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import absolute_import +from __future__ import print_function +from __future__ import unicode_literals + +import os +import re +import time +import logging +import json +from pathlib import Path +from random import random +from tqdm import tqdm +from functools import reduce, partial +import pickle +import argparse +from functools import partial +from io import open + +import numpy as np +import logging + +import paddle as P + +from propeller import log +import propeller.paddle as propeller + +from ernie.modeling_ernie import ErnieModel, ErnieModelForQuestionAnswering +from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer +#from ernie.optimization import AdamW, LinearDecay + +from demo.mrc import mrc_reader +from demo.mrc import mrc_metrics +from demo.utils import create_if_not_exists, get_warmup_and_linear_decay + +log.setLevel(logging.DEBUG) +logging.getLogger().setLevel(logging.DEBUG) + + +def evaluate(model, ds, all_examples, all_features, tokenizer, args): + dev_file = json.loads(open(args.dev_file, encoding='utf8').read()) + with P.no_grad(): + log.debug('start eval') + model.eval() + all_res = [] + for step, (uids, token_ids, token_type_ids, _, __) in enumerate( + P.io.DataLoader( + ds, places=P.CUDAPlace(env.dev_id), batch_size=None)): + _, start_logits, end_logits = model(token_ids, token_type_ids) + res = [ + mrc_metrics.RawResult( + unique_id=u, start_logits=s, end_logits=e) + for u, s, e in zip(uids.numpy(), + start_logits.numpy(), end_logits.numpy()) + ] + all_res += res + open('all_res', 'wb').write(pickle.dumps(all_res)) + all_pred, all_nbests = mrc_metrics.make_results( + tokenizer, + all_examples, + all_features, + all_res, + n_best_size=args.n_best_size, + max_answer_length=args.max_answer_length, + do_lower_case=tokenizer.lower) + f1, em, _, __ = mrc_metrics.evaluate(dev_file, all_pred) + model.train() + log.debug('done 
eval')
+    return f1, em
+
+
+def train(model, train_dataset, dev_dataset, dev_examples, dev_features,
+          tokenizer, args):
+    model = P.DataParallel(model)
+
+    max_steps = len(train_features) * args.epoch // args.bsz
+
+    g_clip = P.nn.ClipGradByGlobalNorm(1.0)  #experimental
+    lr_scheduler = P.optimizer.lr.LambdaDecay(
+        args.lr,
+        get_warmup_and_linear_decay(max_steps,
+                                    int(args.warmup_proportion * max_steps)))
+
+    opt = P.optimizer.AdamW(
+        lr_scheduler,
+        parameters=model.parameters(),
+        weight_decay=args.wd,
+        grad_clip=g_clip)
+
+    train_dataset = train_dataset \
+        .cache_shuffle_shard(env.nranks, env.dev_id, drop_last=True) \
+        .padded_batch(args.bsz)
+
+    log.debug('init training with args: %s' % repr(args))
+    scaler = P.amp.GradScaler(enable=args.use_amp)
+    create_if_not_exists(args.save_dir)
+
+    with P.amp.auto_cast(enable=args.use_amp):
+        for step, (_, token_ids, token_type_ids, start_pos,
+                   end_pos) in enumerate(
+                       P.io.DataLoader(
+                           train_dataset,
+                           places=P.CUDAPlace(env.dev_id),
+                           batch_size=None)):
+            loss, _, __ = model(
+                token_ids,
+                token_type_ids,
+                start_pos=start_pos,
+                end_pos=end_pos)
+            loss = scaler.scale(loss)
+            loss.backward()
+            scaler.minimize(opt, loss)
+            model.clear_gradients()
+            lr_scheduler.step()
+
+            if env.dev_id == 0 and step % 10 == 0:
+                _lr = lr_scheduler.get_lr()
+                if args.use_amp:
+                    _l = (loss / scaler._scale).numpy()
+                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % (
+                        env.dev_id, step, _l, _lr, scaler._scale.numpy())
+                else:
+                    _l = loss.numpy()
+                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % (
+                        env.dev_id, step, _l, _lr)
+                log.debug(msg)
+
+            if env.dev_id == 0 and step % 100 == 0:
+                f1, em = evaluate(model, dev_dataset, dev_examples,
+                                  dev_features, tokenizer, args)
+                log.debug('[step %d] eval result: f1 %.5f em %.5f' %
+                          (step, f1, em))
+            if env.dev_id == 0 and args.save_dir is not None:
+                P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
+            if step > max_steps:
+                break
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser('MRC model with ERNIE')
+    parser.add_argument(
+        '--from_pretrained',
+        type=Path,
+        required=True,
+        help='pretrained model directory or tag')
+    parser.add_argument(
+        '--max_seqlen',
+        type=int,
+        default=512,
+        help='max sentence length, should not be greater than 512')
+    parser.add_argument('--bsz', type=int, default=8, help='batchsize')
+    parser.add_argument('--epoch', type=int, default=2, help='epoch')
+    parser.add_argument(
+        '--train_file',
+        type=str,
+        required=True,
+        help='path to the train data file')
+    parser.add_argument(
+        '--dev_file',
+        type=str,
+        required=True,
+        help='path to the develop data file')
+    parser.add_argument('--warmup_proportion', type=float, default=0.1)
+    parser.add_argument('--lr', type=float, default=3e-5, help='learning rate')
+    parser.add_argument(
+        '--save_dir', type=Path, required=True, help='model output directory')
+    parser.add_argument(
+        '--n_best_size', type=int, default=20, help='nbest prediction to keep')
+    parser.add_argument(
+        '--max_answer_length', type=int, default=100, help='max answer span')
+    parser.add_argument(
+        '--wd',
+        type=float,
+        default=0.01,
+        help='weight decay, aka L2 regularizer')
+    parser.add_argument(
+        '--use_amp',
+        action='store_true',
+        help='only activate AMP (auto mixed precision acceleration) on TensorCore compatible devices'
+    )
+
+    args = parser.parse_args()
+
+    env = P.distributed.ParallelEnv()
+    P.distributed.init_parallel_env()
+
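+    # the same directory-or-tag is used for the tokenizer here and for the
+    # model weights below, keeping vocab and checkpoint consistent
+    tokenizer = 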
ErnieTokenizer.from_pretrained(args.from_pretrained) + + if not os.path.exists(args.train_file): + raise RuntimeError('input data not found at %s' % args.train_file) + if not os.path.exists(args.dev_file): + raise RuntimeError('input data not found at %s' % args.dev_file) + + log.info('making train/dev data...') + train_examples = mrc_reader.read_files(args.train_file, is_training=True) + train_features = mrc_reader.convert_example_to_features( + train_examples, args.max_seqlen, tokenizer, is_training=True) + + dev_examples = mrc_reader.read_files(args.dev_file, is_training=False) + dev_features = mrc_reader.convert_example_to_features( + dev_examples, args.max_seqlen, tokenizer, is_training=False) + + log.info('train examples: %d, features: %d' % + (len(train_examples), len(train_features))) + + def map_fn(unique_id, example_index, doc_span_index, tokens, + token_to_orig_map, token_is_max_context, token_ids, + position_ids, text_type_ids, start_position, end_position): + if start_position is None: + start_position = 0 + if end_position is None: + end_position = 0 + return np.array(unique_id), np.array(token_ids), np.array( + text_type_ids), np.array(start_position), np.array(end_position) + + train_dataset = propeller.data.Dataset.from_list(train_features).map( + map_fn) + + dev_dataset = propeller.data.Dataset.from_list(dev_features).map( + map_fn).padded_batch(args.bsz) + + model = ErnieModelForQuestionAnswering.from_pretrained( + args.from_pretrained, name='') + + train(model, train_dataset, dev_dataset, dev_examples, dev_features, + tokenizer, args) + + if env.dev_id == 0: + f1, em = evaluate(model, dev_dataset, dev_examples, dev_features, + tokenizer, args) + log.debug('final eval result: f1 %.5f em %.5f' % (f1, em)) + if env.dev_id == 0 and args.save_dir is not None: + P.save(model.state_dict(), args.save_dir / 'ckpt.bin') diff --git a/demo/finetune_mrc_dygraph.py b/demo/finetune_mrc_dygraph.py deleted file mode 100644 index 13d1e8b..0000000 --- a/demo/finetune_mrc_dygraph.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals - - -import os -import re -import time -import logging -import json -from random import random -from tqdm import tqdm -from functools import reduce, partial -import pickle -import argparse - -import numpy as np -import logging - -import paddle -import paddle.fluid as F -import paddle.fluid.dygraph as D -import paddle.fluid.layers as L - -from propeller import log -import propeller.paddle as propeller - -from ernie.modeling_ernie import ErnieModel, ErnieModelForQuestionAnswering -from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer -from ernie.optimization import AdamW, LinearDecay - -from demo.mrc import mrc_reader -from demo.mrc import mrc_metrics - -log.setLevel(logging.DEBUG) -logging.getLogger().setLevel(logging.DEBUG) - - -def evaluate(model, ds, all_examples, all_features, tokenizer, args): - dev_file = json.loads(open(args.dev_file).read()) - with D.base._switch_tracer_mode_guard_(is_train=False): - log.debug('start eval') - model.eval() - all_res = [] - for step, (uids, token_ids, token_type_ids, _, __) in enumerate(ds.start(place)): - _ , start_logits, end_logits = model(token_ids, token_type_ids) - res = [mrc_metrics.RawResult(unique_id=u, start_logits=s, end_logits=e) - for u, s, e in zip(uids.numpy(), start_logits.numpy(), end_logits.numpy())] - all_res += res - open('all_res', 'wb').write(pickle.dumps(all_res)) - all_pred, all_nbests = mrc_metrics.make_results( - tokenizer, - all_examples, - all_features, - all_res, - n_best_size=args.n_best_size, - max_answer_length=args.max_answer_length, - do_lower_case=tokenizer.lower) - f1, em, _, __ = mrc_metrics.evaluate(dev_file, all_pred) - model.train() - log.debug('done eval') - return f1, em - - -def train(model, train_dataset, dev_dataset, dev_examples, dev_features, tokenizer, args): - ctx = D.parallel.prepare_context() - model = D.parallel.DataParallel(model, ctx) - - max_steps = len(train_features) * args.epoch // args.bsz - g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental - opt = AdamW(learning_rate=args.lr, parameter_list=model.parameters(), weight_decay=args.wd, grad_clip=g_clip) - - train_dataset = train_dataset \ - .repeat() \ - .shard(D.parallel.Env().nranks, D.parallel.Env().dev_id) \ - .shuffle(1000) \ - .padded_batch(args.bsz) - - log.debug('init training with args: %s' % repr(args)) - for step, (_, token_ids, token_type_ids, start_pos, end_pos) in enumerate(train_dataset.start(place)): - loss, _, __ = model(token_ids, token_type_ids, start_pos=start_pos, end_pos=end_pos) - scaled_loss = model.scale_loss(loss) - scaled_loss.backward() - model.apply_collective_grads() - opt.minimize(scaled_loss) - model.clear_gradients() - if D.parallel.Env().dev_id == 0 and step % 10 == 0: - log.debug('[step %d] train loss %.5f lr %.3e' % (step, loss.numpy(), opt.current_step_lr())) - if D.parallel.Env().dev_id == 0 and step % 100 == 0: - f1, em = evaluate(model, dev_dataset, dev_examples, dev_features, tokenizer, args) - log.debug('[step %d] eval result: f1 %.5f em %.5f' % (step, f1, em)) - if step > max_steps: - break - - -if __name__ == "__main__": - parser = argparse.ArgumentParser('MRC model with ERNIE') - parser.add_argument('--from_pretrained', type=str, required=True, help='pretrained model directory or tag') - parser.add_argument('--max_seqlen', type=int, default=512, help='max sentence length, should not greater than 512') - 
parser.add_argument('--bsz', type=int, default=8, help='batchsize') - parser.add_argument('--epoch', type=int, default=2, help='epoch') - parser.add_argument('--train_file', type=str, required=True, help='data directory includes train / develop data') - parser.add_argument('--dev_file', type=str, required=True, help='data directory includes train / develop data') - parser.add_argument('--warmup_proportion', type=float, default=0.1) - parser.add_argument('--lr', type=float, default=3e-5, help='learning rate') - parser.add_argument('--save_dir', type=str, default=None, help='model output directory') - parser.add_argument('--n_best_size', type=int, default=20, help='nbest prediction to keep') - parser.add_argument('--max_answer_length', type=int, default=100, help='max answer span') - parser.add_argument('--wd', type=float, default=0.00, help='weight decay, aka L2 regularizer') - - args = parser.parse_args() - - tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained) - - if not os.path.exists(args.train_file): - raise RuntimeError('input data not found at %s' % args.train_file) - if not os.path.exists(args.dev_file): - raise RuntimeError('input data not found at %s' % args.dev_file) - - log.info('making train/dev data...') - train_examples = mrc_reader.read_files(args.train_file, is_training=True) - train_features = mrc_reader.convert_example_to_features(train_examples, args.max_seqlen, tokenizer, is_training=True) - - dev_examples = mrc_reader.read_files(args.dev_file, is_training=False) - dev_features = mrc_reader.convert_example_to_features(dev_examples, args.max_seqlen, tokenizer, is_training=False) - - log.info('train examples: %d, features: %d' % (len(train_examples), len(train_features))) - - def map_fn(unique_id, example_index, doc_span_index, tokens, token_to_orig_map, token_is_max_context, token_ids, position_ids, text_type_ids, start_position, end_position): - if start_position is None: - start_position = 0 - if end_position is None: - end_position = 0 - return np.array(unique_id), np.array(token_ids), np.array(text_type_ids), np.array(start_position), np.array(end_position) - - train_dataset = propeller.data.Dataset.from_list(train_features).map(map_fn) - - dev_dataset = propeller.data.Dataset.from_list(dev_features).map(map_fn).padded_batch(args.bsz) - shapes = ([-1], [-1, args.max_seqlen], [-1, args.max_seqlen], [-1], [-1]) - types = ('int64', 'int64', 'int64', 'int64', 'int64') - - train_dataset.name = 'train' - dev_dataset.name = 'dev' - - train_dataset.data_shapes = shapes - train_dataset.data_types = types - dev_dataset.data_shapes = shapes - dev_dataset.data_types = types - - place = F.CUDAPlace(D.parallel.Env().dev_id) - D.guard(place).__enter__() - model = ErnieModelForQuestionAnswering.from_pretrained(args.from_pretrained, name='') - - train(model, train_dataset, dev_dataset, dev_examples, dev_features, tokenizer, args) - - if D.parallel.Env().dev_id == 0: - f1, em = evaluate(model, dev_dataset, dev_examples, dev_features, tokenizer, args) - log.debug('final eval result: f1 %.5f em %.5f' % (f1, em)) - if D.parallel.Env().dev_id == 0 and args.save_dir is not None: - F.save_dygraph(model.state_dict(), args.save_dir) - diff --git a/demo/finetune_ner.py b/demo/finetune_ner.py new file mode 100644 index 0000000..7489f16 --- /dev/null +++ b/demo/finetune_ner.py @@ -0,0 +1,258 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+import time
+import logging
+import six
+import json
+from random import random
+from tqdm import tqdm
+from collections import OrderedDict
+from functools import reduce, partial
+from pathlib import Path
+from visualdl import LogWriter
+
+import numpy as np
+import multiprocessing
+import pickle
+
+from sklearn.metrics import f1_score
+import paddle as P
+
+from propeller import log
+import propeller.paddle as propeller
+
+log.setLevel(logging.DEBUG)
+logging.getLogger().setLevel(logging.DEBUG)
+
+from demo.utils import create_if_not_exists, get_warmup_and_linear_decay
+from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification, ErnieModelForTokenClassification
+from ernie.tokenizing_ernie import ErnieTokenizer
+#from ernie.optimization import AdamW, LinearDecay
+
+parser = propeller.ArgumentParser('NER model with ERNIE')
+parser.add_argument('--max_seqlen', type=int, default=256)
+parser.add_argument('--bsz', type=int, default=32)
+parser.add_argument('--data_dir', type=str, required=True)
+parser.add_argument('--epoch', type=int, default=6)
+parser.add_argument(
+    '--warmup_proportion',
+    type=float,
+    default=0.1,
+    help='learning rate will rise to `lr` at `warmup_proportion` * `max_steps` and decay to 0. at `max_steps`'
+)
+parser.add_argument(
+    '--max_steps',
+    type=int,
+    required=True,
+    help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE, used in learning rate scheduler'
+)
+parser.add_argument(
+    '--use_amp',
+    action='store_true',
+    help='only activate AMP (auto mixed precision acceleration) on TensorCore compatible devices'
+)
+
+parser.add_argument('--from_pretrained', type=Path, required=True)
+parser.add_argument('--lr', type=float, default=5e-5, help='learning rate')
+parser.add_argument(
+    '--save_dir', type=Path, required=True, help='model output directory')
+parser.add_argument(
+    '--wd', type=float, default=0.01, help='weight decay, aka L2 regularizer')
+args = parser.parse_args()
+
+tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained)
+
+
+def tokenizer_func(inputs):
+    ret = inputs.split(b'\2')
+    tokens, orig_pos = [], []
+    for i, r in enumerate(ret):
+        t = tokenizer.tokenize(r)
+        for tt in t:
+            tokens.append(tt)
+            orig_pos.append(i)
+    assert len(tokens) == len(orig_pos)
+    return tokens + orig_pos
+
+
+def tokenizer_func_for_label(inputs):
+    return inputs.split(b'\2')
+
+
+feature_map = {
+    b"B-PER": 0,
+    b"I-PER": 1,
+    b"B-ORG": 2,
+    b"I-ORG": 3,
+    b"B-LOC": 4,
+    b"I-LOC": 5,
+    b"O": 6,
+}
+other_tag_id = feature_map[b'O']
+
+feature_column = propeller.data.FeatureColumns([
+    propeller.data.TextColumn(
+        'text_a',
+        unk_id=tokenizer.unk_id,
+        vocab_dict=tokenizer.vocab,
+        tokenizer=tokenizer_func), propeller.data.TextColumn(
+            'label',
+            unk_id=other_tag_id,
+            vocab_dict=feature_map,
+            tokenizer=tokenizer_func_for_label, )
+])
+
+
+def before(seg, label):
+    seg, orig_pos = np.split(seg, 2)
+    aligned_label = label[orig_pos]
+    seg, _ = tokenizer.truncate(seg, [], args.max_seqlen)
+    aligned_label, _ = tokenizer.truncate(aligned_label, [], args.max_seqlen)
+    orig_pos, _ = tokenizer.truncate(orig_pos, [], args.max_seqlen)
+
+    sentence, segments = tokenizer.build_for_ernie(
+        seg
+    )  #utils.data.build_1_pair(seg, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id)
+    aligned_label = np.concatenate([[0], aligned_label, [0]], 0)
+    orig_pos = np.concatenate([[0], orig_pos, [0]])
+
+    assert len(aligned_label) == len(sentence) == len(orig_pos), (
+        len(aligned_label), len(sentence), len(orig_pos))  # aligned
+    return sentence, segments, aligned_label, label, orig_pos
+
+train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=False, use_gz=False) \
+    .map(before) \
+    .padded_batch(args.bsz, (0, 0, -100, other_tag_id + 1, 0)) \
+
+dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
+    .map(before) \
+    .padded_batch(args.bsz, (0, 0, -100, other_tag_id + 1, 0)) \
+
+test_ds = feature_column.build_dataset('test', data_dir=os.path.join(args.data_dir, 'test'), shuffle=False, repeat=False, use_gz=False) \
+    .map(before) \
+    .padded_batch(args.bsz, (0, 0, -100, other_tag_id + 1, 0)) \
+
+
+def evaluate(model, dataset):
+    model.eval()
+    with P.no_grad():
+        chunkf1 = propeller.metrics.ChunkF1(None, None, None, len(feature_map))
+        for step, (ids, sids, aligned_label, label, orig_pos
+                   ) in enumerate(P.io.DataLoader(
+                       dataset, batch_size=None)):
+            loss, logits = model(ids, sids)
+            #print('\n'.join(map(str, logits.numpy().tolist())))
+
+            assert orig_pos.shape[0] == logits.shape[0] == ids.shape[
+                0] == label.shape[0]
+            for pos, lo, la, id in zip(orig_pos.numpy(),
+                                       logits.numpy(),
+                                       label.numpy(), ids.numpy()):
+                _dic = OrderedDict()
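+                # a source token may have been split into several wordpieces;
+                # _dic groups the per-wordpiece logits by original position so
+                # they can be mean-pooled back to one prediction per token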
+                assert len(pos) == len(lo) == len(id)
+                for _pos, _lo, _id in zip(pos, lo, id):
+                    if _id > tokenizer.mask_id:  # [MASK] is the largest special token
+                        _dic.setdefault(_pos, []).append(_lo)
+                merged_lo = np.array(
+                    [np.array(l).mean(0) for _, l in six.iteritems(_dic)])
+                merged_preds = np.argmax(merged_lo, -1)
+                la = la[np.where(la != (other_tag_id + 1))]  #remove pad
+                if len(la) > len(merged_preds):
+                    log.warn(
+                        'accuracy loss due to truncation: label len:%d, truncate to %d'
+                        % (len(la), len(merged_preds)))
+                    merged_preds = np.pad(merged_preds,
+                                          [0, len(la) - len(merged_preds)],
+                                          mode='constant',
+                                          constant_values=7)
+                else:
+                    assert len(la) == len(
+                        merged_preds
+                    ), 'expect label == prediction, got %d vs %d' % (
+                        la.shape, merged_preds.shape)
+                chunkf1.update((merged_preds, la, np.array(len(la))))
+        #f1 = f1_score(np.concatenate(all_label), np.concatenate(all_pred), average='macro')
+        f1 = chunkf1.eval()
+    model.train()
+    return f1
+
+
+model = ErnieModelForTokenClassification.from_pretrained(
+    args.from_pretrained,
+    num_labels=len(feature_map),
+    name='',
+    has_pooler=False)
+
+g_clip = P.nn.ClipGradByGlobalNorm(1.0)  #experimental
+param_name_to_exclude_from_weight_decay = re.compile(
+    r'.*layer_norm_scale|.*layer_norm_bias|.*b_0')
+lr_scheduler = P.optimizer.lr.LambdaDecay(
+    args.lr,
+    get_warmup_and_linear_decay(args.max_steps,
+                                int(args.warmup_proportion * args.max_steps)))
+opt = P.optimizer.AdamW(
+    lr_scheduler,
+    parameters=model.parameters(),
+    weight_decay=args.wd,
+    apply_decay_param_fun=lambda n: not param_name_to_exclude_from_weight_decay.match(n),
+    grad_clip=g_clip)
+
+scaler = P.amp.GradScaler(enable=args.use_amp)
+with LogWriter(
+        logdir=str(create_if_not_exists(args.save_dir / 'vdl'))) as log_writer:
+    with P.amp.auto_cast(enable=args.use_amp):
+        for epoch in range(args.epoch):
+            for step, (
+                    ids, sids, aligned_label, label, orig_pos
+            ) in enumerate(P.io.DataLoader(
+                    train_ds, batch_size=None)):
+                loss, logits = model(ids, sids, labels=aligned_label)
+                #loss, logits = model(ids, sids, labels=aligned_label, loss_weights=P.cast(ids != 0, 'float32'))
+                loss = scaler.scale(loss)
+                loss.backward()
+                scaler.minimize(opt, loss)
+                model.clear_gradients()
+                lr_scheduler.step()
+
+                if step % 10 == 0:
+                    _lr = lr_scheduler.get_lr()
+                    if args.use_amp:
+                        _l = (loss / scaler._scale).numpy()
+                        msg = '[step-%d] train loss %.5f lr %.3e scaling %.3e' % (
+                            step, _l, _lr, scaler._scale.numpy())
+                    else:
+                        _l = loss.numpy()
+                        msg = '[step-%d] train loss %.5f lr %.3e' % (step, _l,
+                                                                     _lr)
+                    log.debug(msg)
+                    log_writer.add_scalar('loss', _l, step=step)
+                    log_writer.add_scalar('lr', _lr, step=step)
+
+                if step % 100 == 0:
+                    f1 = evaluate(model, dev_ds)
+                    log.debug('eval f1: %.5f' % f1)
+                    log_writer.add_scalar('eval/f1', f1, step=step)
+                    if args.save_dir is not None:
+                        P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
+
+    # final eval is kept inside this block so `log_writer` is still open
+    f1 = evaluate(model, dev_ds)
+    log.debug('final eval f1: %.5f' % f1)
+    log_writer.add_scalar('eval/f1', f1, step=step)
+if args.save_dir is not None:
+    P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
diff --git a/demo/finetune_ner_dygraph.py b/demo/finetune_ner_dygraph.py
deleted file mode 100644
index ad9376a..0000000
--- a/demo/finetune_ner_dygraph.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import re -import time -import logging -import six -import json -from random import random -from tqdm import tqdm -from collections import OrderedDict -from functools import reduce, partial - -import numpy as np -import multiprocessing -import pickle -import logging - -from sklearn.metrics import f1_score -import paddle -import paddle.fluid as F -import paddle.fluid.dygraph as FD -import paddle.fluid.layers as L - -from propeller import log -import propeller.paddle as propeller - -log.setLevel(logging.DEBUG) -logging.getLogger().setLevel(logging.DEBUG) - -from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification, ErnieModelForTokenClassification -from ernie.tokenizing_ernie import ErnieTokenizer -from ernie.optimization import AdamW, LinearDecay - - -if __name__ == '__main__': - parser = propeller.ArgumentParser('NER model with ERNIE') - parser.add_argument('--max_seqlen', type=int, default=256) - parser.add_argument('--bsz', type=int, default=32) - parser.add_argument('--data_dir', type=str, required=True) - parser.add_argument('--epoch', type=int, default=6) - parser.add_argument('--warmup_proportion', type=float, default=0.1, help='if use_lr_decay is set, ' - 'learning rate will raise to `lr` at `warmup_proportion` * `max_steps` and decay to 0. at `max_steps`') - parser.add_argument('--max_steps', type=int, required=True, - help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE, used in learning rate scheduler') - parser.add_argument('--from_pretrained', type=str, required=True) - parser.add_argument('--lr', type=float, default=5e-5, help='learning rate') - parser.add_argument('--save_dir', type=str, default=None, help='model output directory') - parser.add_argument('--wd', type=float, default=0.01, help='weight decay, aka L2 regularizer') - args = parser.parse_args() - - tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained) - - - def tokenizer_func(inputs): - ret = inputs.split(b'\2') - tokens, orig_pos = [], [] - for i, r in enumerate(ret): - t = tokenizer.tokenize(r) - for tt in t: - tokens.append(tt) - orig_pos.append(i) - assert len(tokens) == len(orig_pos) - return tokens + orig_pos - - def tokenizer_func_for_label(inputs): - return inputs.split(b'\2') - - feature_map = { - b"B-PER": 0, - b"I-PER": 1, - b"B-ORG": 2, - b"I-ORG": 3, - b"B-LOC": 4, - b"I-LOC": 5, - b"O": 6, - } - other_tag_id = feature_map[b'O'] - - feature_column = propeller.data.FeatureColumns([ - propeller.data.TextColumn('text_a', unk_id=tokenizer.unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer_func), - propeller.data.TextColumn('label', unk_id=other_tag_id, vocab_dict=feature_map, - tokenizer=tokenizer_func_for_label,) - ]) - - def before(seg, label): - seg, orig_pos = np.split(seg, 2) - aligned_label = label[orig_pos] - seg, _ = tokenizer.truncate(seg, [], args.max_seqlen) - aligned_label, _ = tokenizer.truncate(aligned_label, [], args.max_seqlen) - orig_pos, _ = tokenizer.truncate(orig_pos, [], args.max_seqlen) - - sentence, segments = tokenizer.build_for_ernie(seg) #utils.data.build_1_pair(seg, max_seqlen=args.max_seqlen, 
cls_id=cls_id, sep_id=sep_id) - aligned_label = np.concatenate([[0], aligned_label, [0]], 0) - orig_pos = np.concatenate([[0], orig_pos, [0]]) - - assert len(aligned_label) == len(sentence) == len(orig_pos), (len(aligned_label), len(sentence), len(orig_pos)) # alinged - return sentence, segments, aligned_label, label, orig_pos - - train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=False, use_gz=False) \ - .map(before) \ - .padded_batch(args.bsz, (0,0,0, other_tag_id + 1, 0)) \ - - dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \ - .map(before) \ - .padded_batch(args.bsz, (0,0,0, other_tag_id + 1,0)) \ - - test_ds = feature_column.build_dataset('test', data_dir=os.path.join(args.data_dir, 'test'), shuffle=False, repeat=False, use_gz=False) \ - .map(before) \ - .padded_batch(args.bsz, (0,0,0, other_tag_id + 1,0)) \ - - - - shapes = ([-1, args.max_seqlen], [-1, args.max_seqlen], [-1, args.max_seqlen]) - types = ('int64', 'int64', 'int64') - - train_ds.data_shapes = shapes - train_ds.data_types = types - dev_ds.data_shapes = shapes - dev_ds.data_types = types - test_ds.data_shapes = shapes - test_ds.data_types = types - - place = F.CUDAPlace(0) - - @FD.no_grad - def evaluate(model, dataset): - model.eval() - chunkf1 = propeller.metrics.ChunkF1(None, None, None, len(feature_map)) - for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(dataset.start(place))): - loss, logits = model(ids, sids) - #print('\n'.join(map(str, logits.numpy().tolist()))) - - assert orig_pos.shape[0] == logits.shape[0] == ids.shape[0] == label.shape[0] - for pos, lo, la, id in zip(orig_pos.numpy(), logits.numpy(), label.numpy(), ids.numpy()): - _dic = OrderedDict() - assert len(pos) ==len(lo) == len(id) - for _pos, _lo, _id in zip(pos, lo, id): - if _id > tokenizer.mask_id: # [MASK] is the largest special token - _dic.setdefault(_pos, []).append(_lo) - merged_lo = np.array([np.array(l).mean(0) for _, l in six.iteritems(_dic)]) - merged_preds = np.argmax(merged_lo, -1) - la = la[np.where(la != (other_tag_id + 1))] #remove pad - if len(la) > len(merged_preds): - log.warn('accuracy loss due to truncation: label len:%d, truncate to %d' % (len(la), len(merged_preds))) - merged_preds = np.pad(merged_preds, [0, len(la) - len(merged_preds)], mode='constant', constant_values=7) - else: - assert len(la) == len(merged_preds), 'expect label == prediction, got %d vs %d' % (la.shape, merged_preds.shape) - chunkf1.update((merged_preds, la, np.array(len(la)))) - #f1 = f1_score(np.concatenate(all_label), np.concatenate(all_pred), average='macro') - f1 = chunkf1.eval() - model.train() - return f1 - with FD.guard(place): - model = ErnieModelForTokenClassification.from_pretrained(args.from_pretrained, num_labels=len(feature_map), name='', has_pooler=False) - - g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental - opt = AdamW( - learning_rate=LinearDecay(args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps), - parameter_list=model.parameters(), - weight_decay=args.wd, grad_clip=g_clip) - #opt = F.optimizer.AdamOptimizer(learning_rate=LinearDecay(args.lr, args.warmup_steps, args.max_steps), parameter_list=model.parameters()) - for epoch in range(args.epoch): - for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(train_ds.start(place))): - loss, logits = model(ids, sids, labels=aligned_label, loss_weights=L.cast(ids > 
tokenizer.mask_id, 'float32')) # [MASK] is the largest special token
-                loss.backward()
-                if step % 10 == 0 :
-                    log.debug('train loss %.5f, lr %.3e' % (loss.numpy(), opt.current_step_lr()))
-                opt.minimize(loss)
-                model.clear_gradients()
-                if step % 100 == 0 :
-                    f1 = evaluate(model, dev_ds)
-                    log.debug('eval f1: %.5f' % f1)
-
-        f1 = evaluate(model, dev_ds)
-        log.debug('final eval f1: %.5f' % f1)
-        if args.save_dir is not None:
-            F.save_dygraph(model.state_dict(), args.save_dir)
-
-
diff --git a/demo/finetune_sentiment_analysis.py b/demo/finetune_sentiment_analysis.py
new file mode 100644
index 0000000..7946198
--- /dev/null
+++ b/demo/finetune_sentiment_analysis.py
@@ -0,0 +1,206 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+import time
+import logging
+import json
+from random import random
+from tqdm import tqdm
+from functools import reduce, partial
+from pathlib import Path
+from visualdl import LogWriter
+
+import numpy as np
+import argparse
+
+import paddle as P
+
+from propeller import log
+import propeller.paddle as propeller
+
+log.setLevel(logging.DEBUG)
+logging.getLogger().setLevel(logging.DEBUG)
+log = logging.getLogger()
+
+#from model.bert import BertConfig, BertModelLayer
+from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification
+from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer
+#from ernie.optimization import AdamW, LinearDecay
+from demo.utils import create_if_not_exists, get_warmup_and_linear_decay
+
+parser = argparse.ArgumentParser('classify model with ERNIE')
+parser.add_argument(
+    '--from_pretrained',
+    type=Path,
+    required=True,
+    help='pretrained model directory or tag')
+parser.add_argument(
+    '--max_seqlen',
+    type=int,
+    default=128,
+    help='max sentence length, should not be greater than 512')
+parser.add_argument('--bsz', type=int, default=32, help='batchsize')
+parser.add_argument('--epoch', type=int, default=3, help='epoch')
+parser.add_argument(
+    '--data_dir',
+    type=str,
+    required=True,
+    help='data directory includes train / develop data')
+parser.add_argument(
+    '--max_steps',
+    type=int,
+    required=True,
+    help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE')
+parser.add_argument('--warmup_proportion', type=float, default=0.1)
+parser.add_argument('--lr', type=float, default=5e-5, help='learning rate')
+parser.add_argument('--eval', action='store_true')
+parser.add_argument(
+    '--save_dir', type=Path, required=True, help='model output directory')
+parser.add_argument(
+    '--wd', type=float, default=0.01, help='weight decay, aka L2 regularizer')
+parser.add_argument(
+    '--use_amp',
+    action='store_true',
+    help='only activate AMP (auto mixed precision acceleration) on TensorCore compatible devices'
+)
+
+args = parser.parse_args()
+
+tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained)
+#tokenizer = ErnieTinyTokenizer.from_pretrained(args.from_pretrained)
+
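+# note: num_labels=3 mirrors the 3-way scheme used across these demos;
+# for a binary sentiment corpus, set num_labels to match your label set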
+model = ErnieModelForSequenceClassification.from_pretrained(
+    args.from_pretrained, num_labels=3, name='')
+if not args.eval:
+    feature_column = propeller.data.FeatureColumns([
+        propeller.data.TextColumn(
+            'seg_a',
+            unk_id=tokenizer.unk_id,
+            vocab_dict=tokenizer.vocab,
+            tokenizer=tokenizer.tokenize),
+        propeller.data.LabelColumn('label'),
+    ])
+
+    def map_fn(seg_a, label):
+        seg_a, _ = tokenizer.truncate(seg_a, [], seqlen=args.max_seqlen)
+        sentence, segments = tokenizer.build_for_ernie(seg_a, [])
+        return sentence, segments, label
+
+
+    train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=False, use_gz=False) \
+        .map(map_fn) \
+        .padded_batch(args.bsz)
+
+    dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
+        .map(map_fn) \
+        .padded_batch(args.bsz)
+
+    g_clip = P.nn.ClipGradByGlobalNorm(1.0)  #experimental
+    lr_scheduler = P.optimizer.lr.LambdaDecay(
+        args.lr,
+        get_warmup_and_linear_decay(
+            args.max_steps, int(args.warmup_proportion * args.max_steps)))
+
+    param_name_to_exclude_from_weight_decay = re.compile(
+        r'.*layer_norm_scale|.*layer_norm_bias|.*b_0')
+
+    opt = P.optimizer.AdamW(
+        lr_scheduler,
+        parameters=model.parameters(),
+        weight_decay=args.wd,
+        apply_decay_param_fun=lambda n: not param_name_to_exclude_from_weight_decay.match(n),
+        grad_clip=g_clip)
+    scaler = P.amp.GradScaler(enable=args.use_amp)
+    with LogWriter(logdir=str(create_if_not_exists(args.save_dir /
+                                                   'vdl'))) as log_writer:
+        with P.amp.auto_cast(enable=args.use_amp):
+            for epoch in range(args.epoch):
+                for step, d in enumerate(
+                        P.io.DataLoader(
+                            train_ds, places=P.CUDAPlace(0), batch_size=None)):
+                    ids, sids, label = d
+                    loss, _ = model(ids, sids, labels=label)
+                    loss = scaler.scale(loss)
+                    loss.backward()
+                    scaler.minimize(opt, loss)
+                    model.clear_gradients()
+                    lr_scheduler.step()
+
+                    if step % 10 == 0:
+                        _lr = lr_scheduler.get_lr()
+                        if args.use_amp:
+                            _l = (loss / scaler._scale).numpy()
+                            msg = '[step-%d] train loss %.5f lr %.3e scaling %.3e' % (
+                                step, _l, _lr, scaler._scale.numpy())
+                        else:
+                            _l = loss.numpy()
+                            msg = '[step-%d] train loss %.5f lr %.3e' % (
+                                step, _l, _lr)
+                        log.debug(msg)
+                        log_writer.add_scalar('loss', _l, step=step)
+                        log_writer.add_scalar('lr', _lr, step=step)
+
+                    if step % 100 == 0:
+                        acc = []
+                        with P.no_grad():
+                            model.eval()
+                            for _, d in enumerate(
+                                    P.io.DataLoader(
+                                        dev_ds,
+                                        places=P.CUDAPlace(0),
+                                        batch_size=None)):
+                                ids, sids, label = d
+                                loss, logits = model(ids, sids, labels=label)
+                                a = (logits.argmax(-1) == label)
+                                acc.append(a.numpy())
+                            model.train()
+                        acc = np.concatenate(acc).mean()
+                        log_writer.add_scalar('eval/acc', acc, step=step)
+                        log.debug('acc %.5f' % acc)
+                        if args.save_dir is not None:
+                            P.save(model.state_dict(),
+                                   args.save_dir / 'ckpt.bin')
+    if args.save_dir is not None:
+        P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
+else:
+    feature_column = propeller.data.FeatureColumns([
+        propeller.data.TextColumn(
+            'seg_a',
+            unk_id=tokenizer.unk_id,
+            vocab_dict=tokenizer.vocab,
+            tokenizer=tokenizer.tokenize),
+    ])
+
+    sd = P.load(str(args.save_dir / 'ckpt.bin'))
+    model.set_dict(sd)
+    model.eval()
+
+    def map_fn(seg_a):
+        seg_a, _ = tokenizer.truncate(seg_a, [], seqlen=args.max_seqlen)
+        sentence, segments = tokenizer.build_for_ernie(seg_a, [])
+        return sentence, segments
+
+    predict_ds = feature_column.build_dataset_from_stdin('predict') \
+        .map(map_fn) \
+        .padded_batch(args.bsz)
+
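+    # predict mode reads one example per line from stdin and prints the
+    # argmax label id for each line
+    for step, (ids, 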
sids) in enumerate( + P.io.DataLoader( + predict_ds, places=P.CUDAPlace(0), batch_size=None)): + _, logits = model(ids, sids) + pred = logits.numpy().argmax(-1) + print('\n'.join(map(str, pred.tolist()))) diff --git a/demo/finetune_sentiment_analysis_dygraph.py b/demo/finetune_sentiment_analysis_dygraph.py deleted file mode 100644 index 894a55a..0000000 --- a/demo/finetune_sentiment_analysis_dygraph.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import re -import time -import logging -import json -from random import random -from tqdm import tqdm -from functools import reduce, partial - -import numpy as np -import logging -import argparse - -import paddle -import paddle.fluid as F -import paddle.fluid.dygraph as FD -import paddle.fluid.layers as L - -from propeller import log -import propeller.paddle as propeller - -log.setLevel(logging.DEBUG) -logging.getLogger().setLevel(logging.DEBUG) -log = logging.getLogger() - - -#from model.bert import BertConfig, BertModelLayer -from ernie.modeling_ernie import ErnieModel, ErnieModelForSequenceClassification -from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer -from ernie.optimization import AdamW, LinearDecay - - -if __name__ == '__main__': - parser = argparse.ArgumentParser('classify model with ERNIE') - parser.add_argument('--from_pretrained', type=str, required=True, help='pretrained model directory or tag') - parser.add_argument('--max_seqlen', type=int, default=128, help='max sentence length, should not greater than 512') - parser.add_argument('--bsz', type=int, default=32, help='batchsize') - parser.add_argument('--epoch', type=int, default=3, help='epoch') - parser.add_argument('--data_dir', type=str, required=True, help='data directory includes train / develop data') - parser.add_argument('--max_steps', type=int, required=True, help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE') - parser.add_argument('--warmup_proportion', type=float, default=0.1) - parser.add_argument('--lr', type=float, default=5e-5, help='learning rate') - parser.add_argument('--eval', action='store_true') - parser.add_argument('--save_dir', type=str, default=None, help='model output directory') - parser.add_argument('--wd', type=float, default=0.01, help='weight decay, aka L2 regularizer') - - - args = parser.parse_args() - - tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained) - #tokenizer = ErnieTinyTokenizer.from_pretrained(args.from_pretrained) - - place = F.CUDAPlace(0) - with FD.guard(place): - model = ErnieModelForSequenceClassification.from_pretrained(args.from_pretrained, num_labels=3, name='') - if not args.eval: - feature_column = propeller.data.FeatureColumns([ - propeller.data.TextColumn('seg_a', unk_id=tokenizer.unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer.tokenize), - propeller.data.LabelColumn('label'), - ]) - - def map_fn(seg_a, label): - seg_a, _ = tokenizer.truncate(seg_a, [], 
seqlen=args.max_seqlen) - sentence, segments = tokenizer.build_for_ernie(seg_a, []) - return sentence, segments, label - - - train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=False, use_gz=False) \ - .map(map_fn) \ - .padded_batch(args.bsz) - - dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \ - .map(map_fn) \ - .padded_batch(args.bsz) - - shapes = ([-1, args.max_seqlen], [-1, args.max_seqlen], [-1]) - types = ('int64', 'int64', 'int64') - - train_ds.data_shapes = shapes - train_ds.data_types = types - dev_ds.data_shapes = shapes - dev_ds.data_types = types - - g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental - opt = AdamW(learning_rate=LinearDecay( - args.lr, - int(args.warmup_proportion * args.max_steps), args.max_steps), - parameter_list=model.parameters(), - weight_decay=args.wd, - grad_clip=g_clip) - - for epoch in range(args.epoch): - for step, d in enumerate(tqdm(train_ds.start(place), desc='training')): - ids, sids, label = d - loss, _ = model(ids, sids, labels=label) - loss.backward() - if step % 10 == 0: - log.debug('train loss %.5f lr %.3e' % (loss.numpy(), opt.current_step_lr())) - opt.minimize(loss) - model.clear_gradients() - if step % 100 == 0: - acc = [] - with FD.base._switch_tracer_mode_guard_(is_train=False): - model.eval() - for step, d in enumerate(tqdm(dev_ds.start(place), desc='evaluating %d' % epoch)): - ids, sids, label = d - loss, logits = model(ids, sids, labels=label) - #print('\n'.join(map(str, logits.numpy().tolist()))) - a = L.argmax(logits, -1) == label - acc.append(a.numpy()) - model.train() - log.debug('acc %.5f' % np.concatenate(acc).mean()) - if args.save_dir is not None: - F.save_dygraph(model.state_dict(), args.save_dir) - else: - feature_column = propeller.data.FeatureColumns([ - propeller.data.TextColumn('seg_a', unk_id=tokenizer.unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer.tokenize), - ]) - - assert args.save_dir is not None - sd, _ = FD.load_dygraph(args.save_dir) - model.set_dict(sd) - model.eval() - - def map_fn(seg_a): - seg_a, _ = tokenizer.truncate(seg_a, [], seqlen=args.max_seqlen) - sentence, segments = tokenizer.build_for_ernie(seg_a, []) - return sentence, segments - - predict_ds = feature_column.build_dataset_from_stdin('predict') \ - .map(map_fn) \ - .padded_batch(args.bsz) - shapes = ([-1, args.max_seqlen], [-1, args.max_seqlen]) - types = ('int64', 'int64') - predict_ds.data_shapes = shapes - predict_ds.data_types = types - - for step, (ids, sids) in enumerate(predict_ds.start(place)): - _, logits = model(ids, sids) - pred = logits.numpy().argmax(-1) - print('\n'.join(map(str, pred.tolist()))) - - diff --git a/demo/mrc/mrc_metrics.py b/demo/mrc/mrc_metrics.py index 2081488..a94859c 100644 --- a/demo/mrc/mrc_metrics.py +++ b/demo/mrc/mrc_metrics.py @@ -17,7 +17,6 @@ from __future__ import absolute_import from __future__ import print_function from __future__ import unicode_literals - import sys import re import six @@ -29,7 +28,8 @@ import nltk import unicodedata from collections import namedtuple -RawResult = namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"]) +RawResult = namedtuple("RawResult", + ["unique_id", "start_logits", "end_logits"]) log = logging.getLogger(__name__) @@ -340,7 +340,7 @@ def _get_final_text(pred_text, orig_text, tokenizer): def make_results(vocab, all_examples, all_features, all_results, n_best_size, - max_answer_length, 
do_lower_case): + max_answer_length, do_lower_case): """Write final predictions to the json file and log-odds of null if needed.""" tokenizer = _BasicTokenizer(do_lower_case) example_index_to_features = collections.defaultdict(list) @@ -384,7 +384,8 @@ def make_results(vocab, all_examples, all_features, all_results, n_best_size, continue if end_index not in feature.token_to_orig_map: continue - if not feature.token_is_max_context.get(start_index, False): + if not feature.token_is_max_context.get(start_index, + False): continue if end_index < start_index: continue @@ -414,8 +415,8 @@ def make_results(vocab, all_examples, all_features, all_results, n_best_size, break feature = features[pred.feature_index] if pred.start_index > 0: # this is a non-null prediction - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1 - )] + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + @@ -483,9 +484,11 @@ def mixed_segmentation(in_str, rm_punc=False): in_str = in_str.lower().strip() segs_out = [] temp_str = "" - sp_char = ['-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', - ',', '。', ':', '?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、', - '「', '」', '(', ')', '-', '~', '『', '』'] + sp_char = [ + '-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', ',', '。', ':', + '?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、', '「', '」', '(', + ')', '-', '~', '『', '』' + ] for char in in_str: if rm_punc and char in sp_char: continue @@ -510,9 +513,11 @@ def mixed_segmentation(in_str, rm_punc=False): def remove_punctuation(in_str): """remove punctuation""" in_str = in_str.lower().strip() - sp_char = ['-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', - ',', '。', ':', '?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、', - '「', '」', '(', ')', '-', '~', '『', '』'] + sp_char = [ + '-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', ',', '。', ':', + '?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、', '「', '」', '(', + ')', '-', '~', '『', '』' + ] out_segs = [] for char in in_str: if char in sp_char: @@ -525,7 +530,7 @@ def remove_punctuation(in_str): # find longest common string def find_lcs(s1, s2): """find_lcs""" - m = [[0 for i in range(len(s2)+1)] for j in range(len(s1)+1)] + m = [[0 for i in range(len(s2) + 1)] for j in range(len(s1) + 1)] mmax = 0 p = 0 for i in range(len(s1)): @@ -535,7 +540,7 @@ def find_lcs(s1, s2): if m[i + 1][j + 1] > mmax: mmax = m[i + 1][j + 1] p = i + 1 - return s1[p - mmax: p], mmax + return s1[p - mmax:p], mmax def calc_f1_score(answers, prediction): @@ -548,9 +553,9 @@ def calc_f1_score(answers, prediction): if lcs_len == 0: f1_scores.append(0) continue - precision = 1.0 * lcs_len / len(prediction_segs) - recall = 1.0 * lcs_len / len(ans_segs) - f1 = (2 * precision * recall) / (precision + recall) + precision = 1.0 * lcs_len / len(prediction_segs) + recall = 1.0 * lcs_len / len(ans_segs) + f1 = (2 * precision * recall) / (precision + recall) f1_scores.append(f1) return max(f1_scores) @@ -578,20 +583,20 @@ def evaluate(ground_truth_file, prediction_file): context_text = instance['context'].strip() for qas in instance['qas']: total_count += 1 - query_id = qas['id'].strip() - query_text = qas['question'].strip() - answers = [ans["text"] for ans in qas["answers"]] + query_id = qas['id'].strip() + query_text = qas['question'].strip() + answers = 
[ans["text"] for ans in qas["answers"]] if query_id not in prediction_file: - sys.stderr.write('Unanswered question: {}\n'.format(query_id)) + sys.stderr.write('Unanswered question: {}\n'.format( + query_id)) skip_count += 1 continue - prediction = prediction_file[query_id] + prediction = prediction_file[query_id] f1 += calc_f1_score(answers, prediction) em += calc_em_score(answers, prediction) f1_score = f1 / total_count em_score = em / total_count return [f1_score, em_score, total_count, skip_count] - diff --git a/demo/mrc/mrc_reader.py b/demo/mrc/mrc_reader.py index f669261..999925e 100644 --- a/demo/mrc/mrc_reader.py +++ b/demo/mrc/mrc_reader.py @@ -20,33 +20,26 @@ from __future__ import unicode_literals import sys import argparse import logging +from functools import partial +from io import open + +open = partial(open, encoding='utf-8') import json from collections import namedtuple log = logging.getLogger(__name__) +Example = namedtuple('Example', [ + 'qas_id', 'question_text', 'doc_tokens', 'orig_answer_text', + 'start_position', 'end_position' +]) -Example = namedtuple('Example', - ['qas_id', - 'question_text', - 'doc_tokens', - 'orig_answer_text', - 'start_position', - 'end_position']) - -Feature = namedtuple("Feature", - ["unique_id", - "example_index", - "doc_span_index", - "tokens", - "token_to_orig_map", - "token_is_max_context", - "token_ids", - "position_ids", - "text_type_ids", - "start_position", - "end_position"]) +Feature = namedtuple("Feature", [ + "unique_id", "example_index", "doc_span_index", "tokens", + "token_to_orig_map", "token_is_max_context", "token_ids", "position_ids", + "text_type_ids", "start_position", "end_position" +]) def _tokenize_chinese_chars(text): @@ -113,7 +106,8 @@ def _check_is_max_context(doc_spans, cur_span_index, position): return cur_span_index == best_span_index -def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, + orig_answer_text): """improve answer span""" tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) @@ -140,7 +134,7 @@ def read_files(input_file, is_training): start_pos = None end_pos = None orig_answer_text = None - + if is_training: if len(qa["answers"]) != 1: raise ValueError( @@ -151,17 +145,20 @@ def read_files(input_file, is_training): orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) - doc_tokens = [paragraph_text[:answer_offset], - paragraph_text[answer_offset: answer_offset + answer_length], - paragraph_text[answer_offset + answer_length:]] + doc_tokens = [ + paragraph_text[:answer_offset], paragraph_text[ + answer_offset:answer_offset + answer_length], + paragraph_text[answer_offset + answer_length:] + ] start_pos = 1 end_pos = 1 - actual_text = " ".join(doc_tokens[start_pos:(end_pos + 1)]) + actual_text = " ".join(doc_tokens[start_pos:(end_pos + + 1)]) if actual_text.find(orig_answer_text) == -1: log.info("Could not find answer: '%s' vs. 
'%s'", - actual_text, orig_answer_text) + actual_text, orig_answer_text) continue else: doc_tokens = _tokenize_chinese_chars(paragraph_text) @@ -177,7 +174,13 @@ def read_files(input_file, is_training): return examples -def convert_example_to_features(examples, max_seq_length, tokenizer, is_training, doc_stride=128, max_query_length=64): + +def convert_example_to_features(examples, + max_seq_length, + tokenizer, + is_training, + doc_stride=128, + max_query_length=64): """convert example to feature""" features = [] unique_id = 1000000000 @@ -185,7 +188,7 @@ def convert_example_to_features(examples, max_seq_length, tokenizer, is_training for (example_index, example) in enumerate(examples): query_tokens = tokenizer.tokenize(example.question_text) if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0: max_query_length] + query_tokens = query_tokens[0:max_query_length] tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] @@ -202,7 +205,8 @@ def convert_example_to_features(examples, max_seq_length, tokenizer, is_training if is_training: tok_start_position = orig_to_tok_index[example.start_position] if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + tok_end_position = orig_to_tok_index[example.end_position + + 1] - 1 else: tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( @@ -297,4 +301,3 @@ if __name__ == "__main__": features = convert_example_to_features(examples, 512, tokenizer, True) log.debug(len(examples)) log.debug(len(features)) - diff --git a/demo/optimization.py b/demo/optimization.py new file mode 100644 index 0000000..9271ae6 --- /dev/null +++ b/demo/optimization.py @@ -0,0 +1,89 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import logging
+import re
+
+import numpy as np
+import paddle as P
+import paddle.distributed.fleet as fleet
+from propeller.paddle.train.hooks import RunHook
+
+log = logging.getLogger(__name__)
+
+from demo.utils import create_if_not_exists, get_warmup_and_linear_decay
+
+
+def optimization(
+        loss,
+        warmup_steps,
+        num_train_steps,
+        learning_rate,
+        train_program,
+        startup_prog,
+        weight_decay,
+        scheduler='linear_warmup_decay',
+        use_fp16=False, ):
+    """do backward for static graph"""
+
+    def exclude_from_weight_decay(param):
+        name = param.rstrip('.master')
+        if name.find("layer_norm") > -1:
+            return True
+        bias_suffix = ["_bias", "_b", ".b_0"]
+        for suffix in bias_suffix:
+            if name.endswith(suffix):
+                return True
+        return False
+
+    g_clip = P.nn.ClipGradByGlobalNorm(1.0)
+    lr_scheduler = P.optimizer.lr.LambdaDecay(
+        learning_rate,
+        get_warmup_and_linear_decay(num_train_steps, warmup_steps))
+
+    optimizer = P.optimizer.AdamW(
+        learning_rate=lr_scheduler,
+        weight_decay=weight_decay,
+        grad_clip=g_clip,
+        apply_decay_param_fun=exclude_from_weight_decay)
+
+    if use_fp16:
+        log.info('AMP activated')
+        if weight_decay > 0.:
+            raise ValueError(
+                'paddle amp will ignore `weight_decay`, see https://github.com/PaddlePaddle/Paddle/issues/29794'
+            )
+        #amp_list = P.fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
+        #    custom_white_list=['softmax', 'layer_norm', 'gelu'])
+        optimizer = P.fluid.contrib.mixed_precision.decorate(
+            optimizer, init_loss_scaling=3**15, use_dynamic_loss_scaling=True)
+        _, param_grads = optimizer.minimize(loss)
+        loss_scaling = P.static.default_main_program().global_block().var(
+            'loss_scaling_0')
+    else:
+        _, param_grads = optimizer.minimize(loss)
+        loss_scaling = None
+
+    class LRStepHook(RunHook):
+        def after_run(self, _, __):
+            lr_scheduler.step()
+            log.debug('lr step: %.5f' % lr_scheduler.get_lr())
+
+    return LRStepHook(), loss_scaling
diff --git a/demo/pretrain/README.md b/demo/pretrain/README.md
index df69d93..9a83618 100644
--- a/demo/pretrain/README.md
+++ b/demo/pretrain/README.md
@@ -4,7 +4,7 @@ only **mask word** strategy from [Ernie1.0](https://arxiv.org/pdf/1904.09223.pdf
 
 1. make pretrain data
 
-we use documents from multiple datasource (e.g. Wikipedia) to pretrain. 
+we use documents from multiple data sources (e.g. Wikipedia) for pretraining.
 input text should be segmented with spaces (even in Chinese, this segmentation is used for *mask word*). each line corresponds to a *sentence*. an empty line indicates the end of a document.
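For illustration only, a made-up input file in this format might look like the following (tokens separated by spaces, one sentence per line, a blank line separating two documents); the actual corpus and the word-segmentation tool are up to you:

```text
这 是 第一 篇 文档 的 第一 句
这 是 第一 篇 文档 的 第二 句

the second document is in english
every token is separated by a space
```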
@@ -41,4 +41,3 @@ python3 -m paddle.distributed.launch \ --from_pretrained /path/to/ernie1.0_pretrain_dir/ ``` - diff --git a/demo/pretrain/make_pretrain_data.py b/demo/pretrain/make_pretrain_data.py index 6f3d6ed..a77a71f 100644 --- a/demo/pretrain/make_pretrain_data.py +++ b/demo/pretrain/make_pretrain_data.py @@ -15,16 +15,21 @@ import io sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') + def gen_segs(segment_piece): if len(segment_piece) == 0: return [] else: return [min(segment_piece)] * len(segment_piece) + whit_space_pat = re.compile(r'\S+') + + def segment(inputs, inputs_segment): ret = [r.span() for r in whit_space_pat.finditer(inputs)] - ret = [(inputs[s: e], gen_segs(inputs_segment[s: e])) for i, (s, e) in enumerate(ret)] + ret = [(inputs[s:e], gen_segs(inputs_segment[s:e])) + for i, (s, e) in enumerate(ret)] return ret @@ -36,11 +41,13 @@ def tokenize(sen, seg_info): sen = sen.lower() res_word, res_segments = [], [] for match in pat.finditer(sen): - words, pos = _wordpiece(match.group(0), vocab=vocab_set, unk_token='[UNK]') + words, pos = _wordpiece( + match.group(0), vocab=vocab_set, unk_token='[UNK]') start_of_word = match.span()[0] for w, p in zip(words, pos): res_word.append(w) - res_segments.append(gen_segs(seg_info[p[0] + start_of_word: p[1] + start_of_word])) + res_segments.append( + gen_segs(seg_info[p[0] + start_of_word:p[1] + start_of_word])) return res_word, res_segments @@ -63,22 +70,32 @@ def parse_txt(line): print('****', file=sys.stderr) ret_line = [vocab.get(r, vocab['[UNK]']) for r in ret_line] - ret_seginfo = [[-1] if i == [] else i for i in ret_seginfo] #for sentence piece only + ret_seginfo = [[-1] if i == [] else i + for i in ret_seginfo] #for sentence piece only ret_seginfo = [min(i) for i in ret_seginfo] return ret_line, ret_seginfo def build_example(slots): txt, seginfo = slots - txt_fe_list = feature_pb2.FeatureList(feature=[feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=t)) for t in txt]) - segsinfo_fe_list = feature_pb2.FeatureList(feature=[feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=s)) for s in seginfo]) - assert len(txt_fe_list.feature) == len(segsinfo_fe_list.feature), 'txt[%d] and seginfo[%d] size not match' % (len(txt_fe_list.feature), len(segsinfo_fe_list.feature)) + txt_fe_list = feature_pb2.FeatureList(feature=[ + feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=t)) + for t in txt + ]) + segsinfo_fe_list = feature_pb2.FeatureList(feature=[ + feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=s)) + for s in seginfo + ]) + assert len(txt_fe_list.feature) == len( + segsinfo_fe_list.feature), 'txt[%d] and seginfo[%d] size not match' % ( + len(txt_fe_list.feature), len(segsinfo_fe_list.feature)) features = { - 'txt': txt_fe_list, + 'txt': txt_fe_list, 'segs': segsinfo_fe_list, } - ex = example_pb2.SequenceExample(feature_lists=feature_pb2.FeatureLists(feature_list=features)) + ex = example_pb2.SequenceExample(feature_lists=feature_pb2.FeatureLists( + feature_list=features)) return ex @@ -122,15 +139,17 @@ if __name__ == '__main__': args = parser.parse_args() log.setLevel(logging.DEBUG) - from ernie.tokenizing_ernie import _wordpiece pat = re.compile(r'([a-zA-Z0-9]+|\S)') - vocab = {j.strip().split(b'\t')[0].decode('utf8'): i for i, j in enumerate(open(args.vocab, 'rb'))} + vocab = { + j.strip().split(b'\t')[0].decode('utf8'): i + for i, j in enumerate(open(args.vocab, 'rb')) + } vocab_set = set(vocab.keys()) - - with 
open(args.src, 'rb') as from_file, gzip.open(args.tgt, 'wb') as to_file: + with open(args.src, 'rb') as from_file, gzip.open(args.tgt, + 'wb') as to_file: log.info('making gz from bb %s ==> %s' % (from_file, to_file)) build_bb(from_file, to_file) log.info('done: %s' % to_file) diff --git a/demo/pretrain/pretrain.py b/demo/pretrain/pretrain.py index d3ba1c9..c1521b9 100644 --- a/demo/pretrain/pretrain.py +++ b/demo/pretrain/pretrain.py @@ -24,12 +24,11 @@ import re import logging import six from glob import glob +from pathlib import Path from functools import reduce, partial import itertools -import paddle -import paddle.fluid as F -import paddle.fluid.layers as L +import paddle as P import sentencepiece as spm import json @@ -39,12 +38,13 @@ import random as r from ernie.modeling_ernie import ErnieModelForPretraining from ernie.tokenizing_ernie import ErnieTokenizer -from ernie.optimization import optimization +#from ernie.optimization import AdamW, LinearDecay +import propeller as propeller_base import propeller.paddle as propeller from propeller.paddle.data import Dataset - from propeller import log +from demo.utils import create_if_not_exists, get_warmup_and_linear_decay log.setLevel(logging.DEBUG) logging.getLogger().setLevel(logging.DEBUG) @@ -53,6 +53,7 @@ if six.PY3: from itertools import accumulate else: import operator + def accumulate(iterable, func=operator.add, initial=None): 'Return running totals' # accumulate([1,2,3,4,5]) --> 1 3 6 10 15 @@ -71,43 +72,10 @@ else: yield total -def ernie_pretrain_model_fn(features, mode, params, run_config): - """propeller Model wraper for paddle-ERNIE """ - src_ids, sent_ids, mlm_label, mask_pos, nsp_label = features - - ernie = ErnieModelForPretraining(params, name='') - total_loss, mlm_loss, nsp_loss = ernie(src_ids, sent_ids, labels=mlm_label, mlm_pos=mask_pos, nsp_labels=nsp_label) - - metrics = None - inf_spec = None - - propeller.summary.scalar('loss', total_loss) - propeller.summary.scalar('nsp-loss', nsp_loss) - propeller.summary.scalar('mlm-loss', mlm_loss) - - scheduled_lr, loss_scale_coef = optimization( - loss=total_loss, - warmup_steps=params['warmup_steps'], - num_train_steps=run_config.max_steps, - learning_rate=params['learning_rate'], - train_program=F.default_main_program(), - startup_prog=F.default_startup_program(), - weight_decay=params['weight_decay'], - scheduler="linear_warmup_decay", - use_fp16=params['use_fp16'], - ) - - propeller.summary.scalar('lr', scheduled_lr) - if params['use_fp16']: - propeller.summary.scalar('loss_scale', loss_scale_coef) - pred = [total_loss] - - return propeller.ModelSpec(loss=total_loss, mode=mode, metrics=metrics, predictions=pred) - - def truncate_sentence(seq, from_length, to_length): - random_begin = np.random.randint(0, np.maximum(0, from_length - to_length) + 1) - return seq[random_begin: random_begin + to_length] + random_begin = np.random.randint( + 0, np.maximum(0, from_length - to_length) + 1) + return seq[random_begin:random_begin + to_length] def build_pair(seg_a, seg_b, max_seqlen, vocab): @@ -119,9 +87,11 @@ def build_pair(seg_a, seg_b, max_seqlen, vocab): ml = max_seqlen - 3 half_ml = ml // 2 if a_len > b_len: - a_len_truncated, b_len_truncated = np.maximum(half_ml, ml - b_len), np.minimum(half_ml, b_len) + a_len_truncated, b_len_truncated = np.maximum( + half_ml, ml - b_len), np.minimum(half_ml, b_len) else: - a_len_truncated, b_len_truncated = np.minimum(half_ml, a_len), np.maximum(half_ml, ml - a_len) + a_len_truncated, b_len_truncated = np.minimum( + half_ml, a_len), 
np.maximum(half_ml, ml - a_len) seg_a = truncate_sentence(seg_a, a_len, a_len_truncated) seg_b = truncate_sentence(seg_b, b_len, b_len_truncated) @@ -131,9 +101,11 @@ def build_pair(seg_a, seg_b, max_seqlen, vocab): token_type_a = np.ones_like(seg_a_txt, dtype=np.int64) * 0 token_type_b = np.ones_like(seg_b_txt, dtype=np.int64) * 1 - sen_emb = np.concatenate([[cls_id], seg_a_txt, [sep_id], seg_b_txt, [sep_id]], 0) + sen_emb = np.concatenate( + [[cls_id], seg_a_txt, [sep_id], seg_b_txt, [sep_id]], 0) info_emb = np.concatenate([[-1], seg_a_info, [-1], seg_b_info, [-1]], 0) - token_type_emb = np.concatenate([[0], token_type_a, [0], token_type_b, [1]], 0) + token_type_emb = np.concatenate( + [[0], token_type_a, [0], token_type_b, [1]], 0) return sen_emb, info_emb, token_type_emb @@ -145,24 +117,25 @@ def apply_mask(sentence, seg_info, mask_rate, vocab_size, vocab): batch_size, seqlen = shape invalid_pos = np.where(seg_info == -1) - seg_info += 1 #no more =1 + seg_info += 1 #no more =1 seg_info_flatten = seg_info.reshape([-1]) seg_info_incr = seg_info_flatten - np.roll(seg_info_flatten, shift=1) - seg_info = np.add.accumulate(np.array([0 if s == 0 else 1 for s in seg_info_incr])).reshape(shape) + seg_info = np.add.accumulate( + np.array([0 if s == 0 else 1 for s in seg_info_incr])).reshape(shape) seg_info[invalid_pos] = -1 u_seginfo = np.array([i for i in np.unique(seg_info) if i != -1]) np.random.shuffle(u_seginfo) sample_num = max(1, int(len(u_seginfo) * mask_rate)) - u_seginfo = u_seginfo[: sample_num] + u_seginfo = u_seginfo[:sample_num] mask = reduce(np.logical_or, [seg_info == i for i in u_seginfo]) - mask[:, 0] = False # ignore CLS head + mask[:, 0] = False # ignore CLS head rand = np.random.rand(*shape) - choose_original = rand < 0.1 # - choose_random_id = (0.1 < rand) & (rand < 0.2) # - choose_mask_id = 0.2 < rand # + choose_original = rand < 0.1 # + choose_random_id = (0.1 < rand) & (rand < 0.2) # + choose_mask_id = 0.2 < rand # random_id = np.random.randint(1, vocab_size, size=shape) replace_id = mask_id * choose_mask_id + \ @@ -172,30 +145,39 @@ def apply_mask(sentence, seg_info, mask_rate, vocab_size, vocab): mask_pos = np.where(mask) #mask_pos_flatten = list(map(lambda idx: idx[0] * seqlen + idx[1], zip(*mask_pos))) #transpose mask_label = sentence[mask_pos] - sentence[mask_pos] = replace_id[mask_pos] #overwrite + sentence[mask_pos] = replace_id[mask_pos] #overwrite #log.debug(mask_pos_flatten) return sentence, np.stack(mask_pos, -1), mask_label -def make_pretrain_dataset(name, dir, vocab, hparams, args): +def make_pretrain_dataset(name, dir, vocab, args): gz_files = glob(dir) if not gz_files: - raise ValueError('train data not found in %s' % dir) + raise ValueError('train data not found in %s' % gz_files) log.info('read from %s' % '\n'.join(gz_files)) - max_input_seqlen = args.max_seqlen - max_pretrain_seqlen = lambda: max_input_seqlen if r.random() > 0.15 else r.randint(1, max_input_seqlen) # short sentence rate + max_input_seqlen = args.max_seqlen + max_pretrain_seqlen = lambda: max_input_seqlen if r.random() > 0.15 else r.randint(1, max_input_seqlen) # short sentence rate - def _parse_gz(record_str): # function that takes python_str as input - ex = propeller.data.example_pb2.SequenceExample() + def _parse_gz(record_str): # function that takes python_str as input + ex = propeller_base.data.example_pb2.SequenceExample() ex.ParseFromString(record_str) - doc = [np.array(f.int64_list.value, dtype=np.int64) for f in ex.feature_lists.feature_list['txt'].feature] - doc_seg = 
[np.array(f.int64_list.value, dtype=np.int64) for f in ex.feature_lists.feature_list['segs'].feature] + doc = [ + np.array( + f.int64_list.value, dtype=np.int64) + for f in ex.feature_lists.feature_list['txt'].feature + ] + doc_seg = [ + np.array( + f.int64_list.value, dtype=np.int64) + for f in ex.feature_lists.feature_list['segs'].feature + ] return doc, doc_seg def bb_to_segments(filename): ds = Dataset.from_record_file(filename).map(_parse_gz) iterable = iter(ds) + def gen(): buf, size = [], 0 iterator = iter(ds) @@ -205,7 +187,9 @@ def make_pretrain_dataset(name, dir, vocab, hparams, args): #line = np.array(sp_model.SampleEncodeAsIds(line, -1, 0.1), dtype=np.int64) # 0.1 means large variance on sentence piece result if len(line) == 0: continue - line = np.array(line) # 0.1 means large variance on sentence piece result + line = np.array( + line + ) # 0.1 means large variance on sentence piece result line_seg = np.array(line_seg) size += len(line) buf.append(np.stack([line, line_seg]).transpose()) @@ -213,8 +197,9 @@ def make_pretrain_dataset(name, dir, vocab, hparams, args): yield buf, buf, size = [], 0 if len(buf) != 0: - yield buf, + yield buf, buf, size = [], 0 + return Dataset.from_generator_func(gen) def sample_negative(dataset): @@ -228,10 +213,13 @@ def make_pretrain_dataset(name, dir, vocab, hparams, args): seqlen_a = r.randint(1, seqlen) seqlen_b = seqlen - seqlen_a len_a = list(accumulate([len(c) for c in chunk_a])) - buf_a = [c for c, l in zip(chunk_a, len_a) if l < seqlen_a] #always take the first one - buf_b = [c for c, l in zip(chunk_a, len_a) if seqlen_a <= l < seqlen] + buf_a = [c for c, l in zip(chunk_a, len_a) + if l < seqlen_a] #always take the first one + buf_b = [ + c for c, l in zip(chunk_a, len_a) if seqlen_a <= l < seqlen + ] - if r.random() < 0.5: #pos or neg + if r.random() < 0.5: #pos or neg label = np.int64(1) else: label = np.int64(0) @@ -243,7 +231,9 @@ def make_pretrain_dataset(name, dir, vocab, hparams, args): b = np.concatenate(buf_b) #log.debug(a) #log.debug(b) - sample, seg_info, token_type = build_pair(a, b, args.max_seqlen, vocab) #negative sample might exceed max seqlen + sample, seg_info, token_type = build_pair( + a, b, args.max_seqlen, + vocab) #negative sample might exceed max seqlen yield sample, seg_info, token_type, label ds = propeller.data.Dataset.from_generator_func(gen) @@ -251,14 +241,20 @@ def make_pretrain_dataset(name, dir, vocab, hparams, args): def after(sentence, seg_info, segments, label): batch_size, seqlen = sentence.shape - sentence, mask_pos, mlm_label = apply_mask(sentence, seg_info, args.mask_rate, hparams.vocab_size, vocab) + sentence, mask_pos, mlm_label = apply_mask(sentence, seg_info, + args.mask_rate, + len(vocab), vocab) ra = r.random() if ra < args.check: print('***') - print('\n'.join([str(j) + '\t' + '|'.join(map(str, i)) for i, j in zip(sentence.tolist(), label)])) + print('\n'.join([ + str(j) + '\t' + '|'.join(map(str, i)) + for i, j in zip(sentence.tolist(), label) + ])) print('***') - print('\n'.join(['|'.join(map(str, i)) for i in seg_info.tolist()])) + print('\n'.join( + ['|'.join(map(str, i)) for i in seg_info.tolist()])) print('***') print('|'.join(map(str, mlm_label.tolist()))) print('***') @@ -269,13 +265,21 @@ def make_pretrain_dataset(name, dir, vocab, hparams, args): dataset = Dataset.from_list(gz_files) if propeller.train.distribution.status.mode == propeller.train.distribution.DistributionMode.NCCL: log.info('Apply sharding in distribution env') - dataset = 
dataset.shard(propeller.train.distribution.status.num_replica, propeller.train.distribution.status.replica_id)
+        if len(gz_files) < propeller.train.distribution.status.num_replica:
+            raise ValueError(
+                'not enough train file to shard: # of train files: %d, # of workers %d'
+                % (len(gz_files),
+                   propeller.train.distribution.status.num_replica))
+        dataset = dataset.shard(env.nranks, env.dev_id)
+
     dataset = dataset.repeat().shuffle(buffer_size=len(gz_files))
 
-    dataset = dataset.interleave(map_fn=bb_to_segments, cycle_length=len(gz_files), block_length=1)
-    dataset = dataset.shuffle(buffer_size=1000) #must shuffle to ensure negative sample randomness
+    dataset = dataset.interleave(
+        map_fn=bb_to_segments, cycle_length=len(gz_files), block_length=1)
+    dataset = dataset.shuffle(
+        buffer_size=1000)  #must shuffle to ensure negative sample randomness
     dataset = sample_negative(dataset)
-    dataset = dataset.padded_batch(hparams.batch_size, (0, 0, 0, 0)).map(after)
+    dataset = dataset.padded_batch(args.bsz, (0, 0, 0, 0)).map(after)
     dataset.name = name
     return dataset
@@ -287,68 +291,110 @@ if __name__ == '__main__':
     sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8')
 
     parser = propeller.ArgumentParser('DAN model with Paddle')
-    parser.add_argument('--max_seqlen', type=int, default=256)
-    parser.add_argument('--data_dir', type=str, required=True)
-    parser.add_argument('--from_pretrained', type=str, default=None)
-    parser.add_argument('--mask_rate', type=float, default=0.15)
-    parser.add_argument('--check', type=float, default=0.)
-
-    args = parser.parse_args()
-
-    if not os.path.exists(args.from_pretrained):
-        raise ValueError('--from_pretrained not found: %s' % args.from_pretrained)
-    cfg_file_path = os.path.join(args.from_pretrained, 'ernie_config.json')
-    param_path = os.path.join(args.from_pretrained, 'params')
-    vocab_path = os.path.join(args.from_pretrained, 'vocab.txt')
-    assert os.path.exists(cfg_file_path) and os.path.exists(param_path) and os.path.exists(vocab_path)
-
-
-    hparams_cli = propeller.parse_hparam(args)
-    hparams_config_file = json.loads(open(cfg_file_path).read())
-    default_hparams = propeller.HParams(
-        batch_size=50,
-        warmup_steps=10000,
-        learning_rate=1e-4,
-        weight_decay=0.01,
-        use_fp16=False,
+    parser.add_argument(
+        '--max_seqlen',
+        type=int,
+        default=256,
+        help='max sequence length; documents from pretrain data will be expanded to this length'
+    )
+    parser.add_argument(
+        '--data_dir',
+        type=str,
+        required=True,
+        help='protobuf pretrain data directory')
+    parser.add_argument(
+        '--mask_rate',
+        type=float,
+        default=0.15,
+        help='probability of an input token to be masked')
+    parser.add_argument(
+        '--check', type=float, default=0., help='probability of printing debug info')
+    parser.add_argument(
+        '--warmup_steps', type=int, default=10000, help='warmup steps')
+    parser.add_argument(
+        '--max_steps', type=int, default=1000000, help='max pretrain steps')
+    parser.add_argument('--lr', type=float, default=1e-4, help='learning rate')
+    parser.add_argument(
+        '--from_pretrained',
+        type=Path,
+        required=True,
+        help='pretrained model dir')
+    parser.add_argument(
+        '--save_dir', type=Path, required=True, help='model output dir')
+    parser.add_argument(
+        '--wd',
+        type=float,
+        default=0.01,
+        help='weight decay, aka L2 regularizer')
+    parser.add_argument('--bsz', type=int, default=50)
+    parser.add_argument(
+        '--use_amp',
+        action='store_true',
+        help='only activate AMP (auto mixed precision) acceleration on TensorCore compatible devices'
     )
-    hparams =
default_hparams.join(propeller.HParams(**hparams_config_file)).join(hparams_cli) - - default_run_config=dict( - max_steps=1000000, - save_steps=10000, - log_steps=10, - max_ckpt=3, - skip_steps=0, - eval_steps=-1) + args = parser.parse_args() - run_config = dict(default_run_config, **json.loads(args.run_config)) - run_config = propeller.RunConfig(**run_config) + P.distributed.init_parallel_env() + env = P.distributed.ParallelEnv() tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained) - - train_ds = make_pretrain_dataset('train', args.data_dir, - vocab=tokenizer.vocab, hparams=hparams, args=args) - - seq_shape = [-1, args.max_seqlen] - ints_shape = [-1,] - shapes = (seq_shape, seq_shape, ints_shape, [-1, 2], ints_shape) - types = ('int64', 'int64', 'int64', 'int64', 'int64') - - train_ds.data_shapes = shapes - train_ds.data_types = types - ws = None - - #varname_to_warmstart = re.compile(r'^encoder.*[wb]_0$|^.*embedding$|^.*bias$|^.*scale$|^pooled_fc.[wb]_0$') - varname_to_warmstart = re.compile(r'.*') - if args.from_pretrained is not None: - warm_start_dir = os.path.join(args.from_pretrained, 'params') - ws = propeller.WarmStartSetting( - predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(warm_start_dir, v.name)), - from_dir=warm_start_dir - ) - - ernie_learner = propeller.Learner(ernie_pretrain_model_fn, run_config, params=hparams, warm_start_setting=ws) - ernie_learner.train(train_ds) + train_ds = make_pretrain_dataset( + 'train', args.data_dir, vocab=tokenizer.vocab, args=args) + + model = ErnieModelForPretraining.from_pretrained(args.from_pretrained) + + param_name_to_exclue_from_weight_decay = re.compile( + r'.*layer_norm_scale|.*layer_norm_bias|.*b_0') + + lr_scheduler = P.optimizer.lr.LambdaDecay( + args.lr, + get_warmup_and_linear_decay(args.max_steps, args.warmup_steps)) + g_clip = P.nn.ClipGradByGlobalNorm(1.0) #experimental + + opt = P.optimizer.AdamW( + learning_rate=lr_scheduler, + parameters=model.parameters(), + apply_decay_param_fun=lambda n: param_name_to_exclue_from_weight_decay.match(n), + weight_decay=args.wd, + grad_clip=g_clip) + + model = P.DataParallel(model) + + scaler = P.amp.GradScaler(enable=args.use_amp) + create_if_not_exists(args.save_dir) + with P.amp.auto_cast(args.use_amp): + for step, samples in enumerate( + P.io.DataLoader( + train_ds, places=P.CUDAPlace(env.dev_id), batch_size=0)): + (src_ids, sent_ids, mlm_label, mask_pos, nsp_label) = samples + loss, mlmloss, nsploss = model( + src_ids, + sent_ids, + labels=mlm_label, + mlm_pos=mask_pos, + nsp_labels=nsp_label) + loss = scaler.scale(loss) + loss.backward() + scaler.minimize(opt, loss) + model.clear_gradients() + lr_scheduler.step() + + if step % 10 == 0: + _lr = lr_scheduler.get_lr() + if args.use_amp: + _l = (loss / scaler._scale).numpy() + msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % ( + env.dev_id, step, _l, _lr, scaler._scale.numpy()) + else: + _l = loss.numpy() + msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % ( + env.dev_id, step, _l, _lr) + log.debug(msg) + if step % 1000 == 0 and env.dev_id == 0: + log.debug('saveing...') + P.save(model.state_dict(), args.save_dir / 'ckpt.bin') + if step > args.max_steps: + break + log.info('done') diff --git a/demo/pretrain/pretrain_dygraph.py b/demo/pretrain/pretrain_static.py similarity index 50% rename from demo/pretrain/pretrain_dygraph.py rename to demo/pretrain/pretrain_static.py index 3911ad9..57497c7 100644 --- a/demo/pretrain/pretrain_dygraph.py +++ 
b/demo/pretrain/pretrain_static.py @@ -24,14 +24,11 @@ import re import logging import six from glob import glob +from pathlib import Path from functools import reduce, partial import itertools -import paddle -import paddle.fluid as F -import paddle.fluid.dygraph as D -import paddle.fluid.layers as L -import sentencepiece as spm +import paddle as P import json from tqdm import tqdm @@ -40,9 +37,10 @@ import random as r from ernie.modeling_ernie import ErnieModelForPretraining from ernie.tokenizing_ernie import ErnieTokenizer -from ernie.optimization import AdamW, LinearDecay +from demo.optimization import optimization import propeller.paddle as propeller +import propeller as propeller_base from propeller.paddle.data import Dataset from propeller import log @@ -54,6 +52,7 @@ if six.PY3: from itertools import accumulate else: import operator + def accumulate(iterable, func=operator.add, initial=None): 'Return running totals' # accumulate([1,2,3,4,5]) --> 1 3 6 10 15 @@ -72,9 +71,54 @@ else: yield total +def ernie_pretrain_model_fn(features, mode, params, run_config): + """propeller Model wraper for paddle-ERNIE """ + src_ids, sent_ids, mlm_label, mask_pos, nsp_label = features + + ernie = ErnieModelForPretraining(params, name='') + total_loss, mlm_loss, nsp_loss = ernie( + src_ids, + sent_ids, + labels=mlm_label, + mlm_pos=mask_pos, + nsp_labels=nsp_label) + + metrics = None + inf_spec = None + + propeller.summary.scalar('loss', total_loss) + propeller.summary.scalar('nsp-loss', nsp_loss) + propeller.summary.scalar('mlm-loss', mlm_loss) + + lr_step_hook, loss_scale_coef = optimization( + loss=total_loss, + warmup_steps=params['warmup_steps'], + num_train_steps=run_config.max_steps, + learning_rate=params['learning_rate'], + train_program=P.static.default_main_program(), + startup_prog=P.static.default_startup_program(), + weight_decay=params['weight_decay'], + scheduler="linear_warmup_decay", + use_fp16=args.use_amp, ) + scheduled_lr = P.static.default_main_program().global_block().var( + 'learning_rate_0') + propeller.summary.scalar('lr', scheduled_lr) + if args.use_amp: + propeller.summary.scalar('loss_scaling', loss_scale_coef) + pred = [total_loss] + + return propeller.ModelSpec( + loss=total_loss, + mode=mode, + metrics=metrics, + predictions=pred, + train_hooks=[lr_step_hook]) + + def truncate_sentence(seq, from_length, to_length): - random_begin = np.random.randint(0, np.maximum(0, from_length - to_length) + 1) - return seq[random_begin: random_begin + to_length] + random_begin = np.random.randint( + 0, np.maximum(0, from_length - to_length) + 1) + return seq[random_begin:random_begin + to_length] def build_pair(seg_a, seg_b, max_seqlen, vocab): @@ -86,9 +130,11 @@ def build_pair(seg_a, seg_b, max_seqlen, vocab): ml = max_seqlen - 3 half_ml = ml // 2 if a_len > b_len: - a_len_truncated, b_len_truncated = np.maximum(half_ml, ml - b_len), np.minimum(half_ml, b_len) + a_len_truncated, b_len_truncated = np.maximum( + half_ml, ml - b_len), np.minimum(half_ml, b_len) else: - a_len_truncated, b_len_truncated = np.minimum(half_ml, a_len), np.maximum(half_ml, ml - a_len) + a_len_truncated, b_len_truncated = np.minimum( + half_ml, a_len), np.maximum(half_ml, ml - a_len) seg_a = truncate_sentence(seg_a, a_len, a_len_truncated) seg_b = truncate_sentence(seg_b, b_len, b_len_truncated) @@ -98,9 +144,11 @@ def build_pair(seg_a, seg_b, max_seqlen, vocab): token_type_a = np.ones_like(seg_a_txt, dtype=np.int64) * 0 token_type_b = np.ones_like(seg_b_txt, dtype=np.int64) * 1 - sen_emb = 
np.concatenate([[cls_id], seg_a_txt, [sep_id], seg_b_txt, [sep_id]], 0) + sen_emb = np.concatenate( + [[cls_id], seg_a_txt, [sep_id], seg_b_txt, [sep_id]], 0) info_emb = np.concatenate([[-1], seg_a_info, [-1], seg_b_info, [-1]], 0) - token_type_emb = np.concatenate([[0], token_type_a, [0], token_type_b, [1]], 0) + token_type_emb = np.concatenate( + [[0], token_type_a, [0], token_type_b, [1]], 0) return sen_emb, info_emb, token_type_emb @@ -112,24 +160,25 @@ def apply_mask(sentence, seg_info, mask_rate, vocab_size, vocab): batch_size, seqlen = shape invalid_pos = np.where(seg_info == -1) - seg_info += 1 #no more =1 + seg_info += 1 #no more =1 seg_info_flatten = seg_info.reshape([-1]) seg_info_incr = seg_info_flatten - np.roll(seg_info_flatten, shift=1) - seg_info = np.add.accumulate(np.array([0 if s == 0 else 1 for s in seg_info_incr])).reshape(shape) + seg_info = np.add.accumulate( + np.array([0 if s == 0 else 1 for s in seg_info_incr])).reshape(shape) seg_info[invalid_pos] = -1 u_seginfo = np.array([i for i in np.unique(seg_info) if i != -1]) np.random.shuffle(u_seginfo) sample_num = max(1, int(len(u_seginfo) * mask_rate)) - u_seginfo = u_seginfo[: sample_num] + u_seginfo = u_seginfo[:sample_num] mask = reduce(np.logical_or, [seg_info == i for i in u_seginfo]) - mask[:, 0] = False # ignore CLS head + mask[:, 0] = False # ignore CLS head rand = np.random.rand(*shape) - choose_original = rand < 0.1 # - choose_random_id = (0.1 < rand) & (rand < 0.2) # - choose_mask_id = 0.2 < rand # + choose_original = rand < 0.1 # + choose_random_id = (0.1 < rand) & (rand < 0.2) # + choose_mask_id = 0.2 < rand # random_id = np.random.randint(1, vocab_size, size=shape) replace_id = mask_id * choose_mask_id + \ @@ -139,30 +188,39 @@ def apply_mask(sentence, seg_info, mask_rate, vocab_size, vocab): mask_pos = np.where(mask) #mask_pos_flatten = list(map(lambda idx: idx[0] * seqlen + idx[1], zip(*mask_pos))) #transpose mask_label = sentence[mask_pos] - sentence[mask_pos] = replace_id[mask_pos] #overwrite + sentence[mask_pos] = replace_id[mask_pos] #overwrite #log.debug(mask_pos_flatten) return sentence, np.stack(mask_pos, -1), mask_label -def make_pretrain_dataset(name, dir, vocab, args): +def make_pretrain_dataset(name, dir, vocab, hparams, args): gz_files = glob(dir) if not gz_files: - raise ValueError('train data not found in %s' % gz_files) + raise ValueError('train data not found in %s' % dir) log.info('read from %s' % '\n'.join(gz_files)) - max_input_seqlen = args.max_seqlen - max_pretrain_seqlen = lambda: max_input_seqlen if r.random() > 0.15 else r.randint(1, max_input_seqlen) # short sentence rate + max_input_seqlen = args.max_seqlen + max_pretrain_seqlen = lambda: max_input_seqlen if r.random() > 0.15 else r.randint(1, max_input_seqlen) # short sentence rate - def _parse_gz(record_str): # function that takes python_str as input - ex = propeller.data.example_pb2.SequenceExample() + def _parse_gz(record_str): # function that takes python_str as input + ex = propeller_base.data.example_pb2.SequenceExample() ex.ParseFromString(record_str) - doc = [np.array(f.int64_list.value, dtype=np.int64) for f in ex.feature_lists.feature_list['txt'].feature] - doc_seg = [np.array(f.int64_list.value, dtype=np.int64) for f in ex.feature_lists.feature_list['segs'].feature] + doc = [ + np.array( + f.int64_list.value, dtype=np.int64) + for f in ex.feature_lists.feature_list['txt'].feature + ] + doc_seg = [ + np.array( + f.int64_list.value, dtype=np.int64) + for f in ex.feature_lists.feature_list['segs'].feature + ] return 
doc, doc_seg def bb_to_segments(filename): ds = Dataset.from_record_file(filename).map(_parse_gz) iterable = iter(ds) + def gen(): buf, size = [], 0 iterator = iter(ds) @@ -172,7 +230,9 @@ def make_pretrain_dataset(name, dir, vocab, args): #line = np.array(sp_model.SampleEncodeAsIds(line, -1, 0.1), dtype=np.int64) # 0.1 means large variance on sentence piece result if len(line) == 0: continue - line = np.array(line) # 0.1 means large variance on sentence piece result + line = np.array( + line + ) # 0.1 means large variance on sentence piece result line_seg = np.array(line_seg) size += len(line) buf.append(np.stack([line, line_seg]).transpose()) @@ -180,8 +240,9 @@ def make_pretrain_dataset(name, dir, vocab, args): yield buf, buf, size = [], 0 if len(buf) != 0: - yield buf, + yield buf, buf, size = [], 0 + return Dataset.from_generator_func(gen) def sample_negative(dataset): @@ -195,10 +256,13 @@ def make_pretrain_dataset(name, dir, vocab, args): seqlen_a = r.randint(1, seqlen) seqlen_b = seqlen - seqlen_a len_a = list(accumulate([len(c) for c in chunk_a])) - buf_a = [c for c, l in zip(chunk_a, len_a) if l < seqlen_a] #always take the first one - buf_b = [c for c, l in zip(chunk_a, len_a) if seqlen_a <= l < seqlen] + buf_a = [c for c, l in zip(chunk_a, len_a) + if l < seqlen_a] #always take the first one + buf_b = [ + c for c, l in zip(chunk_a, len_a) if seqlen_a <= l < seqlen + ] - if r.random() < 0.5: #pos or neg + if r.random() < 0.5: #pos or neg label = np.int64(1) else: label = np.int64(0) @@ -210,7 +274,9 @@ def make_pretrain_dataset(name, dir, vocab, args): b = np.concatenate(buf_b) #log.debug(a) #log.debug(b) - sample, seg_info, token_type = build_pair(a, b, args.max_seqlen, vocab) #negative sample might exceed max seqlen + sample, seg_info, token_type = build_pair( + a, b, args.max_seqlen, + vocab) #negative sample might exceed max seqlen yield sample, seg_info, token_type, label ds = propeller.data.Dataset.from_generator_func(gen) @@ -218,14 +284,19 @@ def make_pretrain_dataset(name, dir, vocab, args): def after(sentence, seg_info, segments, label): batch_size, seqlen = sentence.shape - sentence, mask_pos, mlm_label = apply_mask(sentence, seg_info, args.mask_rate, len(vocab), vocab) + sentence, mask_pos, mlm_label = apply_mask( + sentence, seg_info, args.mask_rate, hparams.vocab_size, vocab) ra = r.random() if ra < args.check: print('***') - print('\n'.join([str(j) + '\t' + '|'.join(map(str, i)) for i, j in zip(sentence.tolist(), label)])) + print('\n'.join([ + str(j) + '\t' + '|'.join(map(str, i)) + for i, j in zip(sentence.tolist(), label) + ])) print('***') - print('\n'.join(['|'.join(map(str, i)) for i in seg_info.tolist()])) + print('\n'.join( + ['|'.join(map(str, i)) for i in seg_info.tolist()])) print('***') print('|'.join(map(str, mlm_label.tolist()))) print('***') @@ -236,15 +307,17 @@ def make_pretrain_dataset(name, dir, vocab, args): dataset = Dataset.from_list(gz_files) if propeller.train.distribution.status.mode == propeller.train.distribution.DistributionMode.NCCL: log.info('Apply sharding in distribution env') - if len(gz_files) < propeller.train.distribution.status.num_replica: - raise ValueError('not enough train file to shard: # of train files: %d, # of workers %d' % (len(gz_files), propeller.train.distribution.status.num_replica)) - dataset = dataset.shard(propeller.train.distribution.status.num_replica, propeller.train.distribution.status.replica_id) + dataset = dataset.shard( + propeller.train.distribution.status.num_replica, + 
propeller.train.distribution.status.replica_id) dataset = dataset.repeat().shuffle(buffer_size=len(gz_files)) - dataset = dataset.interleave(map_fn=bb_to_segments, cycle_length=len(gz_files), block_length=1) - dataset = dataset.shuffle(buffer_size=1000) #must shuffle to ensure negative sample randomness + dataset = dataset.interleave( + map_fn=bb_to_segments, cycle_length=len(gz_files), block_length=1) + dataset = dataset.shuffle( + buffer_size=1000) #must shuffle to ensure negative sample randomness dataset = sample_negative(dataset) - dataset = dataset.padded_batch(args.bsz, (0, 0, 0, 0)).map(after) + dataset = dataset.padded_batch(hparams.batch_size, (0, 0, 0, 0)).map(after) dataset.name = name return dataset @@ -256,53 +329,77 @@ if __name__ == '__main__': sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') parser = propeller.ArgumentParser('DAN model with Paddle') - parser.add_argument('--max_seqlen', type=int, default=256, help='max sequence length, documents from pretrain data will expand to this length') - parser.add_argument('--data_dir', type=str, required=True, help='protobuf pretrain data directory') - parser.add_argument('--mask_rate', type=float, default=0.15, help='probability of input token tobe masked') - parser.add_argument('--check', type=float, default=0., help='probability of debug info') - parser.add_argument('--warmup_steps', type=int, default=10000, help='warmups steps') - parser.add_argument('--max_steps', type=int, default=1000000, help='max pretrian steps') - parser.add_argument('--lr', type=float, default=1e-4, help='learning_rate') - parser.add_argument('--from_pretrained', type=str, required=True, help='pretraind model dir') - parser.add_argument('--save_dir', type=str, default=None, help='model output_dir') - parser.add_argument('--bsz', type=int, default=50) - + parser.add_argument('--max_seqlen', type=int, default=256) + parser.add_argument('--data_dir', type=str, required=True) + parser.add_argument('--from_pretrained', type=Path, default=None) + parser.add_argument('--use_amp', action='store_true') + parser.add_argument('--mask_rate', type=float, default=0.15) + parser.add_argument('--check', type=float, default=0.) 
args = parser.parse_args() + P.enable_static() + + if not os.path.exists(args.from_pretrained): + raise ValueError('--from_pretrained not found: %s' % + args.from_pretrained) + cfg_file_path = os.path.join(args.from_pretrained, 'ernie_config.json') + param_path = os.path.join(args.from_pretrained, 'params') + vocab_path = os.path.join(args.from_pretrained, 'vocab.txt') + assert os.path.exists(cfg_file_path) and os.path.exists( + param_path) and os.path.exists(vocab_path) + + hparams_cli = propeller.parse_hparam(args) + hparams_config_file = json.loads(open(cfg_file_path).read()) + default_hparams = propeller.HParams( + batch_size=50, + warmup_steps=10000, + learning_rate=1e-4, + weight_decay=0.01, ) + + hparams = default_hparams.join(propeller.HParams( + **hparams_config_file)).join(hparams_cli) + + default_run_config = dict( + max_steps=1000000, + save_steps=10000, + log_steps=10, + max_ckpt=3, + skip_steps=0, + eval_steps=-1) + + run_config = dict(default_run_config, **json.loads(args.run_config)) + run_config = propeller.RunConfig(**run_config) tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained) - train_ds = make_pretrain_dataset('train', args.data_dir, - vocab=tokenizer.vocab, args=args) + train_ds = make_pretrain_dataset( + 'train', + args.data_dir, + vocab=tokenizer.vocab, + hparams=hparams, + args=args) seq_shape = [-1, args.max_seqlen] - ints_shape = [-1,] - shapes = (seq_shape, seq_shape, ints_shape, [-1, 2], ints_shape) + ints_shape = [-1, ] + shapes = (seq_shape, seq_shape, ints_shape, [-1, 2], ints_shape) types = ('int64', 'int64', 'int64', 'int64', 'int64') train_ds.data_shapes = shapes train_ds.data_types = types - - place = F.CUDAPlace(D.parallel.Env().dev_id) - with D.guard(place): - model = ErnieModelForPretraining.from_pretrained(args.from_pretrained) - opt = AdamW(learning_rate=LinearDecay(args.lr, args.warmup_steps, args.max_steps), parameter_list=model.parameters(), weight_decay=0.01) - - ctx = D.parallel.prepare_context() - model = D.parallel.DataParallel(model, ctx) - - for step, samples in enumerate(tqdm(train_ds.start(place))): - (src_ids, sent_ids, mlm_label, mask_pos, nsp_label) = samples - loss, mlmloss, nsploss = model(src_ids, sent_ids, labels=mlm_label, mlm_pos=mask_pos, nsp_labels=nsp_label) - scaled_loss = model.scale_loss(loss) - scaled_loss.backward() - model.apply_collective_grads() - opt.minimize(scaled_loss) - model.clear_gradients() - if step % 10 == 0: - log.debug('train loss %.5f scaled loss %.5f' % (loss.numpy(), scaled_loss.numpy())) - if step % 10000 == 0 and D.parallel.Env().dev_id == 0 and args.save_dir is not None: - F.save_dygraph(model.state_dict(), args.save_dir) - - - + ws = None + + #varname_to_warmstart = re.compile(r'^encoder.*[wb]_0$|^.*embedding$|^.*bias$|^.*scale$|^pooled_fc.[wb]_0$') + varname_to_warmstart = re.compile(r'.*') + if args.from_pretrained is not None: + warm_start_dir = os.path.join(args.from_pretrained, 'params') + ws = propeller.WarmStartSetting( + predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(warm_start_dir, v.name)), + from_dir=warm_start_dir + ) + + ernie_learner = propeller.Learner( + ernie_pretrain_model_fn, + run_config, + params=hparams, + warm_start_setting=ws) + ernie_learner.train(train_ds) diff --git a/demo/seq2seq/README.md b/demo/seq2seq/README.md index 87f0b49..608e5c0 100644 --- a/demo/seq2seq/README.md +++ b/demo/seq2seq/README.md @@ -23,15 +23,15 @@ python3 -m paddle.distributed.launch \ --max_steps $((287113*30/64)) ``` -Note that you need more 
than 2 GPUs to run the finetuning. 
-During multi-gpu finetuning, `max_steps` is used as stop criteria rather than `epoch` to prevent dead block. 
+Note that you need more than 2 GPUs to run the finetuning.
+During multi-gpu finetuning, `max_steps` is used as the stopping criterion rather than `epoch`, to prevent deadlock.
 We simply calculate `max_steps` with: `EPOCH * NUM_TRAIN_EXAMPLES / TOTAL_BATCH`.
 This demo script will save a finetuned model at `--save_dir`, do multi-gpu prediction every `--eval_steps`, and save prediction results at `--predict_output_dir`.
 
 ### Evaluation
 
-While finetuning, a serials of prediction files is generated. 
+While finetuning, a series of prediction files is generated.
 First you need to sort and join all files with:
 
 ```shell
@@ -40,13 +40,13 @@ sort -t$'\t' -k1n ./pred/pred.step60000.* |awk -F"\t" '{print $2}'> final_predic
 
 then use `./eval_cnndm/cnndm_eval.sh` to calculate all metrics
 (`pyrouge` is required to evaluate CNN/Daily Mail.)
-
+
 ```shell
 sh cnndm_eval.sh final_prediction ./data/cnndm/dev.summary
 ```
 
-### Inference 
+### Inference
 
 To run beam search decoding after you have a finetuned model, try:
 
@@ -57,5 +57,3 @@ cat one_column_source_text| python3 demo/seq2seq/decode.py \
 --save_dir ./model_cnndm \
 --bsz 8
 ```
-
-
diff --git a/demo/seq2seq/decode.py b/demo/seq2seq/decode.py
index 2ea427b..0d551e7 100644
--- a/demo/seq2seq/decode.py
+++ b/demo/seq2seq/decode.py
@@ -12,35 +12,31 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from __future__ import division
 from __future__ import absolute_import
 from __future__ import print_function
 from __future__ import unicode_literals
 
 import sys
+import io
 import re
 import argparse
 import logging
 import json
 import numpy as np
+from pathlib import Path
 from collections import namedtuple
 
-import paddle.fluid as F
-import paddle.fluid.layers as L
-import paddle.fluid.dygraph as D
+import paddle as P
+from paddle.nn import functional as F
 
 from ernie.modeling_ernie import ErnieModel, ErnieModelForPretraining, ErnieModelForGeneration
 from ernie.modeling_ernie import _build_linear, _build_ln, append_name
 from ernie.tokenizing_ernie import ErnieTokenizer
-
 from propeller import log
 import propeller.paddle as propeller
 
-logging.getLogger().handlers[0]=log.handlers[0]
-logging.getLogger().setLevel(logging.DEBUG)
-log = logging.getLogger()
 
 @np.vectorize
 def rev_lookup(i):
@@ -49,15 +45,27 @@ def rev_lookup(i):
 
 def gen_bias(encoder_inputs, decoder_inputs, step):
     decoder_bsz, decoder_seqlen = decoder_inputs.shape[:2]
-    attn_bias = L.reshape(L.range(0, decoder_seqlen, 1, dtype='float32') + 1, [1, -1, 1])
-    decoder_bias = L.cast((L.matmul(attn_bias, 1. / attn_bias, transpose_y=True) >= 1.) , 'float32') #[1, 1, decoderlen, decoderlen]
-    encoder_bias = L.unsqueeze(L.cast(L.ones_like(encoder_inputs), 'float32'), [1]) #[bsz, 1, encoderlen]
-    encoder_bias = L.expand(encoder_bias, [1,decoder_seqlen, 1]) #[bsz,decoderlen, encoderlen]
-    decoder_bias = L.expand(decoder_bias, [decoder_bsz, 1, 1]) #[bsz, decoderlen, decoderlen]
-    if step > 0:
-        bias = L.concat([encoder_bias, L.ones([decoder_bsz, decoder_seqlen, step], 'float32'), decoder_bias], -1)
+    attn_bias = P.reshape(
+        P.arange(
+            0, decoder_seqlen, 1, dtype='float32') + 1, [1, -1, 1])
+    decoder_bias = P.cast(
+        (P.matmul(
+            attn_bias, 1.
/ attn_bias, transpose_y=True) >= 1.), + 'float32') #[1, 1, decoderlen, decoderlen] + encoder_bias = P.unsqueeze( + P.cast(P.ones_like(encoder_inputs), 'float32'), + [1]) #[bsz, 1, encoderlen] + encoder_bias = P.tile( + encoder_bias, [1, decoder_seqlen, 1]) #[bsz,decoderlen, encoderlen] + decoder_bias = P.tile(decoder_bias, + [decoder_bsz, 1, 1]) #[bsz, decoderlen, decoderlen] + if step > 0: + bias = P.concat([ + encoder_bias, P.ones([decoder_bsz, decoder_seqlen, step], + 'float32'), decoder_bias + ], -1) else: - bias = L.concat([encoder_bias, decoder_bias], -1) + bias = P.concat([encoder_bias, decoder_bias], -1) return bias @@ -80,56 +88,80 @@ def gen_bias(encoder_inputs, decoder_inputs, step): # return all_ids, all_sids -@D.no_grad -def greedy_search_infilling(model, q_ids, q_sids, sos_id, eos_id, attn_id, max_encode_len=640, max_decode_len=100, tgt_type_id=3): +def greedy_search_infilling(model, + q_ids, + q_sids, + sos_id, + eos_id, + attn_id, + max_encode_len=640, + max_decode_len=100, + tgt_type_id=3): model.eval() - #log.debug(q_ids.numpy().tolist()) - _, logits, info = model(q_ids, q_sids) - gen_ids = L.argmax(logits, -1) - d_batch, d_seqlen = q_ids.shape - seqlen = L.reduce_sum(L.cast(q_ids != 0, 'int64'), 1, keep_dim=True) - log.debug(seqlen.numpy()) - log.debug(d_seqlen) - has_stopped = np.zeros([d_batch], dtype=np.bool) - gen_seq_len = np.zeros([d_batch], dtype=np.int64) - output_ids = [] - - past_cache = info['caches'] - - cls_ids = L.ones([d_batch], dtype='int64') * sos_id - attn_ids = L.ones([d_batch], dtype='int64') * attn_id - ids = L.stack([cls_ids, attn_ids], -1) - for step in range(max_decode_len): - log.debug('decode step %d' % step) - bias = gen_bias(q_ids, ids, step) - pos_ids = D.to_variable(np.tile(np.array([[step, step + 1]], dtype=np.int64), [d_batch, 1])) - pos_ids += seqlen - _, logits, info = model(ids, L.ones_like(ids) * tgt_type_id, pos_ids=pos_ids, attn_bias=bias, past_cache=past_cache) - gen_ids = L.argmax(logits, -1) - - past_cached_k, past_cached_v = past_cache - cached_k, cached_v = info['caches'] - cached_k = [L.concat([pk, k[:, :1, :]], 1) for pk, k in zip(past_cached_k, cached_k)] # concat cached - cached_v = [L.concat([pv, v[:, :1, :]], 1) for pv, v in zip(past_cached_v, cached_v)] - past_cache = (cached_k, cached_v) - - gen_ids = gen_ids[:, 1] - ids = L.stack([gen_ids, attn_ids], 1) - - gen_ids = gen_ids.numpy() - has_stopped |= (gen_ids == eos_id).astype(np.bool) - gen_seq_len += (1 - has_stopped.astype(np.int64)) - output_ids.append(gen_ids.tolist()) - if has_stopped.all(): - #log.debug('exit because all done') - break - #if step == 1: break - output_ids = np.array(output_ids).transpose([1, 0]) + with P.no_grad(): + #log.debug(q_ids.numpy().tolist()) + _, logits, info = model(q_ids, q_sids) + gen_ids = P.argmax(logits, -1) + d_batch, d_seqlen = q_ids.shape + seqlen = P.cast(q_ids != 0, 'int64').sum(1, keepdim=True) + log.debug(seqlen.numpy()) + log.debug(d_seqlen) + has_stopped = np.zeros([d_batch], dtype=np.bool) + gen_seq_len = np.zeros([d_batch], dtype=np.int64) + output_ids = [] + + past_cache = info['caches'] + + cls_ids = P.ones([d_batch], dtype='int64') * sos_id + attn_ids = P.ones([d_batch], dtype='int64') * attn_id + ids = P.stack([cls_ids, attn_ids], -1) + for step in range(max_decode_len): + log.debug('decode step %d' % step) + bias = gen_bias(q_ids, ids, step) + pos_ids = P.to_tensor( + np.tile( + np.array( + [[step, step + 1]], dtype=np.int64), [d_batch, 1])) + pos_ids += seqlen + _, logits, info = model( + ids, + P.ones_like(ids) * 
tgt_type_id, + pos_ids=pos_ids, + attn_bias=bias, + past_cache=past_cache) + gen_ids = P.argmax(logits, -1) + + past_cached_k, past_cached_v = past_cache + cached_k, cached_v = info['caches'] + cached_k = [ + P.concat([pk, k[:, :1, :]], 1) + for pk, k in zip(past_cached_k, cached_k) + ] # concat cached + cached_v = [ + P.concat([pv, v[:, :1, :]], 1) + for pv, v in zip(past_cached_v, cached_v) + ] + past_cache = (cached_k, cached_v) + + gen_ids = gen_ids[:, 1] + ids = P.stack([gen_ids, attn_ids], 1) + + gen_ids = gen_ids.numpy() + has_stopped |= (gen_ids == eos_id).astype(np.bool) + gen_seq_len += (1 - has_stopped.astype(np.int64)) + output_ids.append(gen_ids.tolist()) + if has_stopped.all(): + #log.debug('exit because all done') + break + #if step == 1: break + output_ids = np.array(output_ids).transpose([1, 0]) return output_ids -BeamSearchState = namedtuple('BeamSearchState', ['log_probs', 'lengths', 'finished']) -BeamSearchOutput = namedtuple('BeamSearchOutput', ['scores', 'predicted_ids', 'beam_parent_ids']) +BeamSearchState = namedtuple('BeamSearchState', + ['log_probs', 'lengths', 'finished']) +BeamSearchOutput = namedtuple('BeamSearchOutput', + ['scores', 'predicted_ids', 'beam_parent_ids']) def log_softmax(x): @@ -138,136 +170,190 @@ def log_softmax(x): def mask_prob(p, onehot_eos, finished): - is_finished = L.cast(L.reshape(finished, [-1, 1]) != 0, 'float32') - p = is_finished * (1. - L.cast(onehot_eos, 'float32')) * -9999. + (1. - is_finished) * p + is_finished = P.cast(P.reshape(finished, [-1, 1]) != 0, 'float32') + p = is_finished * (1. - P.cast(onehot_eos, 'float32')) * -9999. + ( + 1. - is_finished) * p return p def hyp_score(log_probs, length, length_penalty): - lp = L.pow((5.+L.cast(length, 'float32')) / 6., length_penalty) + lp = P.pow((5. 
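+    # GNMT-style length penalty: ((5 + length) / 6) ** length_penalty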
+ P.cast(length, 'float32')) / 6., length_penalty)
     return log_probs / lp


-def beam_search_step(state, logits, eos_id, beam_width, is_first_step, length_penalty):
+def beam_search_step(state, logits, eos_id, beam_width, is_first_step,
+                     length_penalty):
     """logits.shape == [B*W, V]"""
     _, vocab_size = logits.shape

     bsz, beam_width = state.log_probs.shape
-    onehot_eos = L.cast(F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size), 'int64') #[1, V]
+    onehot_eos = P.cast(
+        F.one_hot(P.ones([1], 'int64') * eos_id, vocab_size), 'int64')  #[1, V]

-    probs = L.log(L.softmax(logits)) #[B*W, V]
-    probs = mask_prob(probs, onehot_eos, state.finished) #[B*W, V]
-    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs #[B*W, V]
+    probs = P.log(F.softmax(logits))  #[B*W, V]
+    probs = mask_prob(probs, onehot_eos, state.finished)  #[B*W, V]
+    allprobs = P.reshape(state.log_probs, [-1, 1]) + probs  #[B*W, V]

-    not_finished = 1 - L.reshape(state.finished, [-1, 1]) #[B*W,1]
+    not_finished = 1 - P.reshape(state.finished, [-1, 1])  #[B*W,1]
     not_eos = 1 - onehot_eos
-    length_to_add = not_finished * not_eos #[B*W,V]
-    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add
+    length_to_add = not_finished * not_eos  #[B*W,V]
+    alllen = P.reshape(state.lengths, [-1, 1]) + length_to_add

-    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
-    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
+    allprobs = P.reshape(allprobs, [-1, beam_width * vocab_size])
+    alllen = P.reshape(alllen, [-1, beam_width * vocab_size])
     allscore = hyp_score(allprobs, alllen, length_penalty)
     if is_first_step:
-        allscore = L.reshape(allscore, [bsz, beam_width, -1])[:,0,:] # first step only consiter beam 0
-    scores, idx = L.topk(allscore, k=beam_width) #[B, W]
-    next_beam_id = idx // vocab_size #[B, W]
+        allscore = P.reshape(
+            allscore,
+            [bsz, beam_width, -1])[:, 0, :]  # first step only consider beam 0
+    scores, idx = P.topk(allscore, k=beam_width)  #[B, W]
+    next_beam_id = idx // vocab_size  #[B, W]
     next_word_id = idx % vocab_size

-    gather_idx = L.concat([L.where(idx!=-1)[:, :1], L.reshape(idx, [-1, 1])], 1)
-    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
-    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)
-
-    gather_idx = L.concat([L.where(next_beam_id!=-1)[:, :1], L.reshape(next_beam_id, [-1, 1])], 1)
-    next_finished = L.reshape(L.gather_nd(state.finished, gather_idx), state.finished.shape) #[gather new beam state according to new beam id]
+    gather_idx = P.concat(
+        [P.nonzero(idx != -1)[:, :1], P.reshape(idx, [-1, 1])], 1)
+    next_probs = P.reshape(P.gather_nd(allprobs, gather_idx), idx.shape)
+    next_len = P.reshape(P.gather_nd(alllen, gather_idx), idx.shape)
+
+    gather_idx = P.concat([
+        P.nonzero(next_beam_id != -1)[:, :1], P.reshape(next_beam_id, [-1, 1])
+    ], 1)
+    next_finished = P.reshape(
+        P.gather_nd(state.finished, gather_idx), state.finished.
+ shape) #[gather new beam state according to new beam id] #log.debug(gather_idx.numpy()) #log.debug(state.finished.numpy()) #log.debug(next_finished.numpy()) - next_finished += L.cast(next_word_id==eos_id, 'int64') - next_finished = L.cast(next_finished > 0, 'int64') + next_finished += P.cast(next_word_id == eos_id, 'int64') + next_finished = P.cast(next_finished > 0, 'int64') #log.debug(next_word_id.numpy()) #log.debug(next_beam_id.numpy()) - next_state = BeamSearchState(log_probs=next_probs, lengths=next_len, finished=next_finished) - output = BeamSearchOutput(scores=scores, predicted_ids=next_word_id, beam_parent_ids=next_beam_id) + next_state = BeamSearchState( + log_probs=next_probs, lengths=next_len, finished=next_finished) + output = BeamSearchOutput( + scores=scores, + predicted_ids=next_word_id, + beam_parent_ids=next_beam_id) return output, next_state -@D.no_grad -def beam_search_infilling(model, q_ids, q_sids, sos_id, eos_id, attn_id, max_encode_len=640, max_decode_len=100, beam_width=5, tgt_type_id=3, length_penalty=1.0): +def beam_search_infilling(model, + q_ids, + q_sids, + sos_id, + eos_id, + attn_id, + max_encode_len=640, + max_decode_len=100, + beam_width=5, + tgt_type_id=3, + length_penalty=1.0): model.eval() - #log.debug(q_ids.numpy().tolist()) - _, __, info = model(q_ids, q_sids) - d_batch, d_seqlen = q_ids.shape - - state = BeamSearchState( - log_probs=L.zeros([d_batch, beam_width], 'float32'), - lengths=L.zeros([d_batch, beam_width], 'int64'), - finished=L.zeros([d_batch, beam_width], 'int64')) - outputs = [] - - def reorder_(t, parent_id): - """reorder cache according to parent beam id""" - gather_idx = L.where(parent_id!=-1)[:, 0] * beam_width + L.reshape(parent_id, [-1]) - t = L.gather(t, gather_idx) - return t - - def tile_(t, times): - _shapes = list(t.shape[1:]) - ret = L.reshape(L.expand(L.unsqueeze(t, [1]), [1, times,] + [1,] * len(_shapes)), [-1,] + _shapes) - return ret - - cached_k, cached_v = info['caches'] - cached_k = [tile_(k, beam_width)for k in cached_k] - cached_v = [tile_(v, beam_width)for v in cached_v] - past_cache = (cached_k, cached_v) - - q_ids = tile_(q_ids, beam_width) - seqlen = L.reduce_sum(L.cast(q_ids != 0, 'int64'), 1, keep_dim=True) - #log.debug(q_ids.shape) - - cls_ids = L.ones([d_batch * beam_width], dtype='int64') * sos_id - attn_ids = L.ones([d_batch * beam_width], dtype='int64') * attn_id # SOS - ids = L.stack([cls_ids, attn_ids], -1) - for step in range(max_decode_len): - #log.debug('decode step %d' % step) - bias = gen_bias(q_ids, ids, step) - pos_ids = D.to_variable(np.tile(np.array([[step, step + 1]], dtype=np.int64), [d_batch * beam_width, 1])) - pos_ids += seqlen - _, logits, info = model(ids, L.ones_like(ids) * tgt_type_id, pos_ids=pos_ids, attn_bias=bias, past_cache=past_cache) - - - output, state = beam_search_step(state, logits[:, 1], - eos_id=eos_id, - beam_width=beam_width, - is_first_step=(step==0), - length_penalty=length_penalty) - outputs.append(output) + with P.no_grad(): + #log.debug(q_ids.numpy().tolist()) + _, __, info = model(q_ids, q_sids) + d_batch, d_seqlen = q_ids.shape + + state = BeamSearchState( + log_probs=P.zeros([d_batch, beam_width], 'float32'), + lengths=P.zeros([d_batch, beam_width], 'int64'), + finished=P.zeros([d_batch, beam_width], 'int64')) + outputs = [] + + def reorder_(t, parent_id): + """reorder cache according to parent beam id""" + gather_idx = P.nonzero( + parent_id != -1)[:, 0] * beam_width + P.reshape(parent_id, + [-1]) + t = P.gather(t, gather_idx) + return t + + def tile_(t, times): 
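+            # expand a batch tensor to the beam layout: [bsz, ...] ->
+            # [bsz * times, ...] by inserting a beam axis, tiling it,
+            # then flattening it back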
+ _shapes = list(t.shape[1:]) + ret = P.reshape( + P.tile( + P.unsqueeze(t, [1]), [ + 1, + times, + ] + [1, ] * len(_shapes)), [-1, ] + _shapes) + return ret - past_cached_k, past_cached_v = past_cache cached_k, cached_v = info['caches'] - cached_k = [reorder_(L.concat([pk, k[:, :1, :]], 1), output.beam_parent_ids) for pk, k in zip(past_cached_k, cached_k)] # concat cached - cached_v = [reorder_(L.concat([pv, v[:, :1, :]], 1), output.beam_parent_ids) for pv, v in zip(past_cached_v, cached_v)] + cached_k = [tile_(k, beam_width) for k in cached_k] + cached_v = [tile_(v, beam_width) for v in cached_v] past_cache = (cached_k, cached_v) + q_ids = tile_(q_ids, beam_width) + seqlen = P.cast(q_ids != 0, 'int64').sum(1, keepdim=True) + #log.debug(q_ids.shape) + + cls_ids = P.ones([d_batch * beam_width], dtype='int64') * sos_id + attn_ids = P.ones( + [d_batch * beam_width], dtype='int64') * attn_id # SOS + ids = P.stack([cls_ids, attn_ids], -1) + for step in range(max_decode_len): + #log.debug('decode step %d' % step) + bias = gen_bias(q_ids, ids, step) + pos_ids = P.to_tensor( + np.tile( + np.array( + [[step, step + 1]], dtype=np.int64), + [d_batch * beam_width, 1])) + pos_ids += seqlen + _, logits, info = model( + ids, + P.ones_like(ids) * tgt_type_id, + pos_ids=pos_ids, + attn_bias=bias, + past_cache=past_cache) + + output, state = beam_search_step( + state, + logits[:, 1], + eos_id=eos_id, + beam_width=beam_width, + is_first_step=(step == 0), + length_penalty=length_penalty) + outputs.append(output) + + past_cached_k, past_cached_v = past_cache + cached_k, cached_v = info['caches'] + cached_k = [ + reorder_( + P.concat([pk, k[:, :1, :]], 1), output.beam_parent_ids) + for pk, k in zip(past_cached_k, cached_k) + ] # concat cached + cached_v = [ + reorder_( + P.concat([pv, v[:, :1, :]], 1), output.beam_parent_ids) + for pv, v in zip(past_cached_v, cached_v) + ] + past_cache = (cached_k, cached_v) + + pred_ids_flatten = P.reshape(output.predicted_ids, + [d_batch * beam_width]) + ids = P.stack([pred_ids_flatten, attn_ids], 1) + + if state.finished.numpy().all(): + #log.debug('exit because all done') + break + #if step == 1: break + + final_ids = P.stack([o.predicted_ids for o in outputs], 0) + final_parent_ids = P.stack([o.beam_parent_ids for o in outputs], 0) + final_ids = P.fluid.layers.gather_tree( + final_ids, final_parent_ids)[:, :, 0] #pick best beam + final_ids = P.transpose( + P.reshape(final_ids, [-1, d_batch * 1]), [1, 0]) + return final_ids - pred_ids_flatten = L.reshape(output.predicted_ids, [d_batch * beam_width]) - ids = L.stack([pred_ids_flatten, attn_ids], 1) - - if state.finished.numpy().all(): - #log.debug('exit because all done') - break - #if step == 1: break - final_ids = L.stack([o.predicted_ids for o in outputs], 0) - final_parent_ids = L.stack([o.beam_parent_ids for o in outputs], 0) - final_ids = L.gather_tree(final_ids, final_parent_ids)[:,:,0] #pick best beam - final_ids = L.transpose(L.reshape(final_ids, [-1, d_batch * 1]), [1, 0]) - return final_ids - en_patten = re.compile(r'^[a-zA-Z0-9]*$') + def post_process(token): if token.startswith('##'): ret = token[2:] @@ -280,66 +366,86 @@ def post_process(token): if __name__ == '__main__': + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') + parser = argparse.ArgumentParser('seq2seq model with ERNIE') - parser.add_argument('--from_pretrained', type=str, required=True, help='pretrained model directory or tag') + parser.add_argument( + 
'--from_pretrained',
+        type=Path,
+        required=True,
+        help='pretrained model directory or tag')
     parser.add_argument('--bsz', type=int, default=8, help='batchsize')
     parser.add_argument('--max_encode_len', type=int, default=640)
     parser.add_argument('--max_decode_len', type=int, default=120)
     parser.add_argument('--tgt_type_id', type=int, default=3)
     parser.add_argument('--beam_width', type=int, default=5)
-    parser.add_argument('--attn_token', type=str, default='[ATTN]', help='if [ATTN] not in vocab, you can specified [MAKK] as attn-token')
+    parser.add_argument(
+        '--attn_token',
+        type=str,
+        default='[ATTN]',
+        help='if [ATTN] is not in vocab, you can specify [MASK] as the attn-token')
     parser.add_argument('--length_penalty', type=float, default=1.0)
-    parser.add_argument('--save_dir', type=str, required=True, help='model dir to be loaded')
+    parser.add_argument(
+        '--save_dir', type=str, required=True, help='model dir to be loaded')
     args = parser.parse_args()

-    place = F.CUDAPlace(D.parallel.Env().dev_id)
-    D.guard(place).__enter__()
+    env = P.distributed.ParallelEnv()

-    ernie = ErnieModelForGeneration.from_pretrained(args.from_pretrained, name='')
-    tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained, mask_token=None)
+    ernie = ErnieModelForGeneration.from_pretrained(
+        args.from_pretrained, name='')
+    tokenizer = ErnieTokenizer.from_pretrained(
+        args.from_pretrained, mask_token=None)
     rev_dict = {v: k for k, v in tokenizer.vocab.items()}
-    rev_dict[tokenizer.pad_id] = ''  # replace [PAD]
-    rev_dict[tokenizer.unk_id] = ''  # replace [PAD]
+    rev_dict[tokenizer.pad_id] = ''  # replace [PAD]
+    rev_dict[tokenizer.unk_id] = ''  # replace [UNK]

-    sd, _ = D.load_dygraph(args.save_dir)
-    ernie.set_dict(sd)
+    sd = P.load(args.save_dir)
+    ernie.set_state_dict(sd)

     def map_fn(src_ids):
-        src_ids = src_ids[: args.max_encode_len]
+        src_ids = src_ids[:args.max_encode_len]
         src_ids, src_sids = tokenizer.build_for_ernie(src_ids)
         return (src_ids, src_sids)

     feature_column = propeller.data.FeatureColumns([
-        propeller.data.TextColumn('seg_a', unk_id=tokenizer.unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer.tokenize),
+        propeller.data.TextColumn(
+            'seg_a',
+            unk_id=tokenizer.unk_id,
+            vocab_dict=tokenizer.vocab,
+            tokenizer=tokenizer.tokenize),
     ])

-    dataset = feature_column.build_dataset_from_stdin('predict').map(map_fn).padded_batch(args.bsz)
+    dataset = feature_column.build_dataset_from_stdin('predict').map(
+        map_fn).padded_batch(args.bsz)

     for step, (encoder_ids, encoder_sids) in enumerate(dataset):
-        #result_ids = greedy_search_infilling(ernie, D.to_variable(encoder_ids), D.to_variable(encoder_sids),
+        #result_ids = greedy_search_infilling(ernie, P.to_tensor(encoder_ids), P.to_tensor(encoder_sids),
         #        eos_id=tokenizer.sep_id,
         #        sos_id=tokenizer.cls_id,
         #        attn_id=tokenizer.vocab[args.attn_id],
-        #        max_decode_len=args.max_decode_len,
-        #        max_encode_len=args.max_encode_len,
+        #        max_decode_len=args.max_decode_len,
+        #        max_encode_len=args.max_encode_len,
         #        beam_width=args.beam_width,
         #        tgt_type_id=args.tgt_type_id)
-        result_ids = beam_search_infilling(ernie, D.to_variable(encoder_ids), D.to_variable(encoder_sids),
-                eos_id=tokenizer.sep_id,
-                sos_id=tokenizer.cls_id,
-                attn_id=tokenizer.vocab[args.attn_token],
-                max_decode_len=args.max_decode_len,
-                max_encode_len=args.max_encode_len,
-                beam_width=args.beam_width,
-                length_penalty=args.length_penalty,
-                tgt_type_id=args.tgt_type_id)
+        result_ids = beam_search_infilling(
+            ernie,
+            P.to_tensor(encoder_ids),
+            P.to_tensor(encoder_sids),
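+            # P.to_tensor is the Paddle 2.0 replacement for D.to_variable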
eos_id=tokenizer.sep_id, + sos_id=tokenizer.cls_id, + attn_id=tokenizer.vocab[args.attn_token], + max_decode_len=args.max_decode_len, + max_encode_len=args.max_encode_len, + beam_width=args.beam_width, + length_penalty=args.length_penalty, + tgt_type_id=args.tgt_type_id) output_str = rev_lookup(result_ids.numpy()) for ostr in output_str.tolist(): if '[SEP]' in ostr: - ostr = ostr[: ostr.index('[SEP]')] - + ostr = ostr[:ostr.index('[SEP]')] + ostr = ''.join(map(post_process, ostr)) ostr = ostr.strip() print(ostr) - diff --git a/demo/seq2seq/finetune_seq2seq.py b/demo/seq2seq/finetune_seq2seq.py new file mode 100644 index 0000000..937ad88 --- /dev/null +++ b/demo/seq2seq/finetune_seq2seq.py @@ -0,0 +1,420 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import absolute_import +from __future__ import print_function +from __future__ import unicode_literals + +import sys +import argparse +import logging +import json +import re +import os +import numpy as np + +from pathlib import Path +from copy import deepcopy + +import paddle as P +from paddle.nn import functional as F + +from tqdm import tqdm + +from ernie.modeling_ernie import ErnieModel, ErnieModelForPretraining, ErnieModelForGeneration +from ernie.modeling_ernie import _build_linear, _build_ln, append_name +from ernie.tokenizing_ernie import ErnieTokenizer +#from ernie.optimization import AdamW, LinearDecay + +from demo.seq2seq.decode import beam_search_infilling, post_process +from demo.utils import create_if_not_exists, get_warmup_and_linear_decay + +from propeller import log +log.setLevel(logging.DEBUG) +import propeller.paddle as propeller + + +@np.vectorize +def rev_lookup(i): + return rev_dict[i] + + +def evaluate(model, datasets, step, args): + predict_output_dir = args.predict_output_dir / ('pred.step%d.%d' % + (step, env.dev_id)) + with predict_output_dir.open('w') as outf: + with P.amp.auto_cast(enable=False): + for step, data in enumerate( + P.io.DataLoader( + datasets, + places=P.CUDAPlace(env.dev_id), + batch_size=None)): + (example_id, src_ids, src_sids, src_pids, _, _, _, _, _, _, _, + _) = data # never use target when infer + output_ids = beam_search_infilling( + model, + src_ids, + src_sids, + eos_id=tokenizer.sep_id, + sos_id=tokenizer.cls_id, + attn_id=tokenizer.vocab[args.attn_token], + max_decode_len=args.max_decode_len, + max_encode_len=args.max_encode_len, + beam_width=args.beam_width, + length_penalty=args.length_penalty, + tgt_type_id=args.tgt_type_id, ) + output_str = rev_lookup(output_ids.numpy()) + for eid, ostr in zip(example_id.numpy().tolist(), + output_str.tolist()): + if '[SEP]' in ostr: + ostr = ostr[:ostr.index('[SEP]')] + ostr = ''.join(map(post_process, ostr)) + print('%d\t%s' % (eid, ostr), file=outf) + + model.train() + + +def seq2seq(model, tokenizer, args): + log.info('Training starts with args: %r' % args) + attn_id = tokenizer.vocab[args.attn_token] + + def gen_mask(batch_ids, 
mask_type='bidi', query_len=None, pad_value=0):
+        if query_len is None:
+            query_len = batch_ids.shape[1]
+        if mask_type != 'empty':
+            mask = (batch_ids != pad_value).astype(np.float32)
+            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
+            if mask_type == 'causal':
+                assert query_len == batch_ids.shape[1]
+                mask = np.tril(mask)
+            elif mask_type == 'causal_without_diag':
+                assert query_len == batch_ids.shape[1]
+                mask = np.tril(mask, -1)
+            elif mask_type == 'diag':
+                assert query_len == batch_ids.shape[1]
+                mask = np.stack([np.diag(np.diag(m)) for m in mask], 0)
+        else:  # mask_type == 'empty'
+            mask = np.zeros_like(batch_ids).astype(np.float32)
+            mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1])
+        return mask
+
+    def make_some_noise(ids):
+        if args.use_random_noice:
+            noise_ids = np.random.randint(
+                1, len(tokenizer.vocab), size=ids.shape)
+        else:
+            noise_ids = np.ones_like(ids) * tokenizer.vocab['[NOISE]']
+        pos, = np.where(np.ones_like(ids))
+        np.random.shuffle(pos)
+        pos = pos[:int(args.noise_prob * len(pos))]
+        ids[pos, ] = noise_ids[pos, ]
+        return ids
+
+    def map_fn(example_id, src_ids, tgt_ids):
+        src_ids = src_ids[:args.max_encode_len]
+        tgt_ids = tgt_ids[:args.max_decode_len]
+        src_ids, src_sids = tokenizer.build_for_ernie(src_ids)
+        src_pids = np.arange(len(src_ids))
+
+        tgt_ids, tgt_sids = tokenizer.build_for_ernie(tgt_ids)
+        tgt_pids = np.arange(len(tgt_ids)) + len(src_ids)  # continuous positions
+        tgt_sids = np.ones_like(tgt_sids) * args.tgt_type_id
+
+        attn_ids = np.ones_like(tgt_ids) * attn_id
+        if args.noise_prob > 0.:
+            tgt_labels = deepcopy(tgt_ids)
+            tgt_ids = make_some_noise(tgt_ids)  # corrupted
+        else:
+            tgt_labels = tgt_ids
+
+        return (example_id, src_ids, src_pids, src_sids, tgt_ids, tgt_pids,
+                tgt_sids, attn_ids, tgt_labels)
+
+    def after_padding(example_id, src_ids, src_pids, src_sids, tgt_ids,
+                      tgt_pids, tgt_sids, attn_ids, tgt_labels):
+        '''
+        attention mask:
+        ***  src,  tgt,  attn
+        src  00,   01,   02
+        tgt  10,   11,   12
+        attn 20,   21,   22
+
+        ***   s1, s2 | t1 t2 t3| attn1 attn2 attn3
+        s1    1,  1  | 0, 0, 0,| 0,    0,    0,
+        s2    1,  1  | 0, 0, 0,| 0,    0,    0,
+        -
+        t1    1,  1, | 1, 0, 0,| 0,    0,    0,
+        t2    1,  1, | 1, 1, 0,| 0,    0,    0,
+        t3    1,  1, | 1, 1, 1,| 0,    0,    0,
+        -
+        attn1 1,  1, | 0, 0, 0,| 1,    0,    0,
+        attn2 1,  1, | 1, 0, 0,| 0,    1,    0,
+        attn3 1,  1, | 1, 1, 0,| 0,    0,    1,
+
+        for details, see Fig3.
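+        (an attn query at step i attends to all of src, to the first i-1
+        tgt tokens, and to its own slot only, so step i predicts tgt
+        token i without ever seeing it)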
+        https://arxiv.org/abs/2001.11314
+        '''
+
+        src_len = src_ids.shape[1]
+        tgt_len = tgt_ids.shape[1]
+        mask_00 = gen_mask(src_ids, 'bidi', query_len=src_len)
+        mask_01 = gen_mask(tgt_ids, 'empty', query_len=src_len)
+        mask_02 = gen_mask(attn_ids, 'empty', query_len=src_len)
+
+        mask_10 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
+        mask_11 = gen_mask(tgt_ids, 'causal', query_len=tgt_len)
+        mask_12 = gen_mask(attn_ids, 'empty', query_len=tgt_len)
+
+        mask_20 = gen_mask(src_ids, 'bidi', query_len=tgt_len)
+        mask_21 = gen_mask(tgt_ids, 'causal_without_diag', query_len=tgt_len)
+        mask_22 = gen_mask(attn_ids, 'diag', query_len=tgt_len)
+        '''
+        mask = np.concatenate([
+            np.concatenate([mask_00, mask_01, mask_02], 2),
+            np.concatenate([mask_10, mask_11, mask_12], 2),
+            np.concatenate([mask_20, mask_21, mask_22], 2),
+        ], 1)
+
+        ids = np.concatenate([src_ids, tgt_ids, attn_ids], 1)
+        pids = np.concatenate([src_pids, tgt_pids, tgt_pids], 1)
+        sids = np.concatenate([src_sids, tgt_sids, tgt_sids], 1)
+
+        '''
+
+        mask_src_2_src = mask_00
+        mask_tgt_2_srctgt = np.concatenate([mask_10, mask_11], 2)
+        mask_attn_2_srctgtattn = np.concatenate([mask_20, mask_21, mask_22], 2)
+
+        tgt_labels = tgt_labels[np.where(tgt_labels != 0)]
+        return (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
+                tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
+                mask_attn_2_srctgtattn, tgt_labels)
+
+    bytes_vocab = {k.encode('utf8'): v for k, v in tokenizer.vocab.items()}
+    feature_column = propeller.data.FeatureColumns([
+        propeller.data.LabelColumn('id'),
+        propeller.data.TextColumn(
+            'src', unk_id=tokenizer.unk_id, vocab_dict=bytes_vocab),
+        propeller.data.TextColumn(
+            'tgt', unk_id=tokenizer.unk_id, vocab_dict=bytes_vocab),
+    ])
+
+    train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=True, use_gz=False) \
+                   .map(map_fn) \
+                   .padded_batch(args.bsz) \
+                   .map(after_padding)
+
+    dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
+                   .map(map_fn) \
+                   .padded_batch(args.eval_bsz) \
+                   .map(after_padding) \
+                   .shard(env.nranks, env.dev_id)
+
+    vocab_size, _ = model.word_emb.weight.shape
+    model = P.DataParallel(model)
+    g_clip = P.nn.ClipGradByGlobalNorm(1.0)
+    param_name_to_exclude_from_weight_decay = re.compile(
+        r'.*layer_norm_scale|.*layer_norm_bias|.*b_0')
+    lr_scheduler = P.optimizer.lr.LambdaDecay(
+        args.lr,
+        get_warmup_and_linear_decay(
+            args.max_steps, int(args.warmup_proportion * args.max_steps)))
+
+    opt = P.optimizer.AdamW(
+        learning_rate=lr_scheduler,
+        parameters=model.parameters(),
+        weight_decay=args.wd,
+        apply_decay_param_fun=lambda n: not param_name_to_exclude_from_weight_decay.match(n),
+        grad_clip=g_clip)
+
+    scaler = P.amp.GradScaler(enable=args.use_amp)
+    attn_id = tokenizer.vocab[args.attn_token]
+    create_if_not_exists(args.save_dir)
+    if args.predict_output_dir:
+        create_if_not_exists(args.predict_output_dir)
+
+    with P.amp.auto_cast(enable=args.use_amp):
+        for step, data in enumerate(
+                P.io.DataLoader(
+                    train_ds, places=P.CUDAPlace(env.dev_id),
+                    batch_size=None)):
+            (example_id, src_ids, src_sids, src_pids, tgt_ids, tgt_sids,
+             tgt_pids, attn_ids, mask_src_2_src, mask_tgt_2_srctgt,
+             mask_attn_2_srctgtattn, tgt_labels) = data
+
+            _, __, info = model(
+                src_ids,
+                sent_ids=src_sids,
+                pos_ids=src_pids,
+                attn_bias=mask_src_2_src,
+                encode_only=True)
+            cached_k, cached_v = info['caches']
+            _, __, info = model(
+                tgt_ids,
+                sent_ids=tgt_sids,
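+                # second pass: encode the gold tgt tokens against the
+                # cached src states (tgt position i may attend to all of
+                # src and to tgt positions <= i)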
+                pos_ids=tgt_pids,
+                attn_bias=mask_tgt_2_srctgt,
+                past_cache=(cached_k, cached_v),
+                encode_only=True)
+            cached_k2, cached_v2 = info['caches']
+            past_cache_k = [
+                P.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)
+            ]
+            past_cache_v = [
+                P.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)
+            ]
+            tgt_labels = F.one_hot(tgt_labels, vocab_size)
+            if args.label_smooth > 0.:
+                tgt_labels = F.label_smooth(
+                    tgt_labels, epsilon=args.label_smooth)
+            loss, _, __ = model(
+                attn_ids,
+                sent_ids=tgt_sids,
+                pos_ids=tgt_pids,
+                attn_bias=mask_attn_2_srctgtattn,
+                past_cache=(past_cache_k, past_cache_v),
+                tgt_labels=tgt_labels,
+                tgt_pos=P.nonzero(attn_ids == attn_id))
+
+            loss = scaler.scale(loss)
+            loss.backward()
+            scaler.minimize(opt, loss)
+            model.clear_gradients()
+            lr_scheduler.step()
+
+            if step % 10 == 0:
+                _lr = lr_scheduler.get_lr()
+                if args.use_amp:
+                    _l = (loss / scaler._scale).numpy()
+                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e scaling %.3e' % (
+                        env.dev_id, step, _l, _lr, scaler._scale.numpy())
+                else:
+                    _l = loss.numpy()
+                    msg = '[rank-%d][step-%d] train loss %.5f lr %.3e' % (
+                        env.dev_id, step, _l, _lr)
+                log.debug(msg)
+
+            if args.save_dir is not None and step % 1000 == 0 and env.dev_id == 0:
+                P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
+
+            if args.predict_output_dir is not None and step > args.skip_eval_steps and step % args.eval_steps == 0:
+                assert args.predict_output_dir.exists(), \
+                    'predict_output_dir not found: %s' % args.predict_output_dir
+                log.debug('doing predict on gpu %d...' % env.dev_id)
+                evaluate(model, dev_ds, step, args)
+            if step > args.max_steps:
+                break
+        evaluate(model, dev_ds, step, args)
+
+    if args.save_dir is not None:
+        P.save(model.state_dict(), args.save_dir / 'ckpt.bin')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser('seq2seq model with ERNIE')
+    parser.add_argument(
+        '--from_pretrained',
+        type=Path,
+        required=True,
+        help='pretrained model directory or tag')
+    parser.add_argument('--bsz', type=int, default=8, help='batchsize')
+    parser.add_argument(
+        '--eval_bsz', type=int, default=20, help='evaluation batchsize')
+    parser.add_argument(
+        '--data_dir',
+        type=str,
+        required=True,
+        help='data directory that includes the train / dev data')
+    parser.add_argument(
+        '--max_steps',
+        type=int,
+        required=True,
+        help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE')
+    parser.add_argument(
+        '--eval_steps', type=int, default=5000, help='evaluation frequency')
+    parser.add_argument(
+        '--skip_eval_steps',
+        type=int,
+        default=1,
+        help='skip evaluation for the first n steps')
+    parser.add_argument('--max_encode_len', type=int, default=640)
+    parser.add_argument('--max_decode_len', type=int, default=120)
+    parser.add_argument('--tgt_type_id', type=int, default=3)
+    parser.add_argument('--warmup_proportion', type=float, default=0.1)
+    parser.add_argument('--beam_width', type=int, default=5)
+    parser.add_argument(
+        '--noise_prob',
+        type=float,
+        default=0.7,
+        help='probability of a token being replaced')
+    parser.add_argument(
+        '--use_random_noice',
+        action='store_true',
+        help='if set, replace target tokens with random tokens from the vocabulary; otherwise replace with `[NOISE]`'
+    )
+    parser.add_argument('--lr', type=float, default=5e-5, help='learning rate')
+    parser.add_argument('--label_smooth', type=float, default=0.1)
+    parser.add_argument('--length_penalty', type=float, default=1.0)
+    parser.add_argument(
+        '--predict_output_dir',
+        type=Path,
+        default=None,
+        help='predict file output directory')
+    parser.add_argument(
+        '--attn_token',
+        type=str,
+        default='[ATTN]',
+        help='if [ATTN] is not in vocab, you can specify [MASK] as the attn-token')
+    parser.add_argument(
+        '--inference_model_dir',
+        type=str,
+        default=None,
+        help='inference model output directory')
+    parser.add_argument(
+        '--init_checkpoint',
+        type=str,
+        default=None,
+        help='checkpoint to warm start from')
+    parser.add_argument(
+        '--save_dir', type=Path, default=None, help='model output directory')
+    parser.add_argument(
+        '--wd',
+        type=float,
+        default=0.01,
+        help='weight decay, aka L2 regularizer')
+    parser.add_argument(
+        '--use_amp',
+        action='store_true',
+        help='only activate AMP (auto mixed precision acceleration) on TensorCore compatible devices'
+    )
+
+    args = parser.parse_args()
+
+    env = P.distributed.ParallelEnv()
+    P.distributed.init_parallel_env()
+
+    ernie = ErnieModelForGeneration.from_pretrained(args.from_pretrained)
+    tokenizer = ErnieTokenizer.from_pretrained(
+        args.from_pretrained, mask_token=None)
+    rev_dict = {v: k for k, v in tokenizer.vocab.items()}
+    rev_dict[tokenizer.pad_id] = ''  # replace [PAD]
+    rev_dict[tokenizer.unk_id] = ''  # replace [UNK]
+
+    if args.init_checkpoint is not None:
+        log.info('loading checkpoint from %s' % args.init_checkpoint)
+        sd = P.load(args.init_checkpoint)
+        ernie.set_state_dict(sd)
+
+    seq2seq(ernie, tokenizer, args)
diff --git a/demo/seq2seq/finetune_seq2seq_dygraph.py b/demo/seq2seq/finetune_seq2seq_dygraph.py
deleted file mode 100644
index 77140b9..0000000
--- a/demo/seq2seq/finetune_seq2seq_dygraph.py
+++ /dev/null
@@ -1,318 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
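The new finetune_seq2seq.py above saves checkpoints with `P.save(model.state_dict(), args.save_dir / 'ckpt.bin')` and warm-starts via `P.load`/`set_state_dict`; the dygraph script deleted below used `F.save_dygraph`/`D.load_dygraph` instead. A minimal sketch of the Paddle 2.0 round trip (paths are hypothetical):

    import paddle as P
    from ernie.modeling_ernie import ErnieModelForGeneration

    model = ErnieModelForGeneration.from_pretrained('./pretrained_dir')  # dir or tag
    P.save(model.state_dict(), './save_dir/ckpt.bin')  # what the training loop writes
    sd = P.load('./save_dir/ckpt.bin')
    model.set_state_dict(sd)  # Paddle 2.0 replacement for set_dict / load_dygraph
    model.eval()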
- - -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals - -import sys -import argparse -import logging -import json -import os -import numpy as np -from copy import deepcopy - -import paddle.fluid as F -import paddle.fluid.layers as L -import paddle.fluid.dygraph as D - -from tqdm import tqdm - -from ernie.modeling_ernie import ErnieModel, ErnieModelForPretraining, ErnieModelForGeneration -from ernie.modeling_ernie import _build_linear, _build_ln, append_name -from ernie.tokenizing_ernie import ErnieTokenizer -from ernie.optimization import AdamW, LinearDecay - -from demo.seq2seq.decode import beam_search_infilling, post_process - -from propeller import log -import propeller.paddle as propeller - -logging.getLogger().handlers[0]=log.handlers[0] -logging.getLogger().setLevel(logging.DEBUG) -log = logging.getLogger() - - -@np.vectorize -def rev_lookup(i): - return rev_dict[i] - -def evaluate(model, datasets, step, args): - did = D.parallel.Env().dev_id - place = F.CUDAPlace(D.parallel.Env().dev_id) - with open(os.path.join(args.predict_output_dir, 'pred.step%d.%d' % (step, did)), 'w') as outf: - for step, data in enumerate(datasets.start(place)): - (example_id, src_ids, src_sids, src_pids, - _, _, _, - _, - _, _, _, _) = data # never use target when infer - output_ids = beam_search_infilling(model, src_ids, src_sids, - eos_id=tokenizer.sep_id, - sos_id=tokenizer.cls_id, - attn_id=tokenizer.vocab[args.attn_token], - max_decode_len=args.max_decode_len, - max_encode_len=args.max_encode_len, - beam_width=args.beam_width, - length_penalty=args.length_penalty, - tgt_type_id=args.tgt_type_id,) - output_str = rev_lookup(output_ids.numpy()) - for eid, ostr in zip(example_id.numpy().tolist(), output_str.tolist()): - if '[SEP]' in ostr: - ostr = ostr[: ostr.index('[SEP]')] - ostr = ''.join(map(post_process, ostr)) - print('%d\t%s' % (eid, ostr), file=outf) - - model.train() - - -def seq2seq(model, tokenizer, args): - log.info('Training starts with args: %r' % args) - attn_id = tokenizer.vocab[args.attn_token] - def gen_mask(batch_ids, mask_type='bidi', query_len=None, pad_value=0): - if query_len is None: - query_len = batch_ids.shape[1] - if mask_type != 'empty': - mask = (batch_ids != pad_value).astype(np.float32) - mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1]) - if mask_type == 'causal': - assert query_len == batch_ids.shape[1] - mask = np.tril(mask) - elif mask_type == 'causal_without_diag': - assert query_len == batch_ids.shape[1] - mask = np.tril(mask, -1) - elif mask_type == 'diag': - assert query_len == batch_ids.shape[1] - mask = np.stack([np.diag(np.diag(m)) for m in mask], 0) - else: - mask_type == 'empty' - mask = np.zeros_like(batch_ids).astype(np.float32) - mask = np.tile(np.expand_dims(mask, 1), [1, query_len, 1]) - return mask - - def make_some_noice(ids): - if args.use_random_noice: - noice_ids = np.random.randint(1, len(tokenizer.vocab), size=ids.shape) - else: - noice_ids = np.ones_like(ids) * tokenizer.vocab['[NOISE]'] - pos, = np.where(np.ones_like(ids)) - np.random.shuffle(pos) - pos = pos[: int(args.noise_prob * len(pos))] - ids[pos,] = noice_ids[pos,] - return ids - - def map_fn(example_id, src_ids, tgt_ids): - src_ids = src_ids[: args.max_encode_len] - tgt_ids = tgt_ids[: args.max_decode_len] - src_ids, src_sids = tokenizer.build_for_ernie(src_ids) - src_pids = np.arange(len(src_ids)) - - tgt_ids, tgt_sids = tokenizer.build_for_ernie(tgt_ids) - tgt_pids = 
np.arange(len(tgt_ids)) + len(src_ids) # continues position - tgt_sids = np.ones_like(tgt_sids) * args.tgt_type_id - - attn_ids = np.ones_like(tgt_ids) * attn_id - if args.noise_prob > 0.: - tgt_labels = deepcopy(tgt_ids) - tgt_ids = make_some_noice(tgt_ids) #corrupted - else: - tgt_labels = tgt_ids - - return (example_id, src_ids, src_pids, src_sids, - tgt_ids, tgt_pids, tgt_sids, - attn_ids, tgt_labels) - - def after_padding(example_id, src_ids, src_pids, src_sids, - tgt_ids, tgt_pids, tgt_sids, - attn_ids, tgt_labels): - ''' - attention mask: - *** src, tgt, attn - src 00, 01, 11 - tgt 10, 11, 12 - attn 20, 21, 22 - - *** s1, s2 | t1 t2 t3| attn1 attn2 attn3 - s1 1, 1 | 0, 0, 0,| 0, 0, 0, - s2 1, 1 | 0, 0, 0,| 0, 0, 0, - - - t1 1, 1, | 1, 0, 0,| 0, 0, 0, - t2 1, 1, | 1, 1, 0,| 0, 0, 0, - t3 1, 1, | 1, 1, 1,| 0, 0, 0, - - - attn1 1, 1, | 0, 0, 0,| 1, 0, 0, - attn2 1, 1, | 1, 0, 0,| 0, 1, 0, - attn3 1, 1, | 1, 1, 0,| 0, 0, 1, - - for details, see Fig3. https://arxiv.org/abs/2001.11314 - ''' - - src_len = src_ids.shape[1] - tgt_len = tgt_ids.shape[1] - mask_00 = gen_mask(src_ids, 'bidi', query_len=src_len) - mask_01 = gen_mask(tgt_ids, 'empty', query_len=src_len) - mask_02 = gen_mask(attn_ids,'empty', query_len=src_len) - - mask_10 = gen_mask(src_ids, 'bidi', query_len=tgt_len) - mask_11 = gen_mask(tgt_ids, 'causal', query_len=tgt_len) - mask_12 = gen_mask(attn_ids, 'empty', query_len=tgt_len) - - mask_20 = gen_mask(src_ids, 'bidi', query_len=tgt_len) - mask_21 = gen_mask(tgt_ids, 'causal_without_diag', query_len=tgt_len) - mask_22 = gen_mask(attn_ids, 'diag', query_len=tgt_len) - - ''' - mask = np.concatenate([ - np.concatenate([mask_00, mask_01, mask_02], 2), - np.concatenate([mask_10, mask_11, mask_12], 2), - np.concatenate([mask_20, mask_21, mask_22], 2), - ], 1) - - ids = np.concatenate([src_ids, tgt_ids, attn_ids], 1) - pids = np.concatenate([src_pids, tgt_pids, tgt_pids], 1) - sids = np.concatenate([src_sids, tgt_sids, tgt_sids], 1) - - ''' - - mask_src_2_src = mask_00 - mask_tgt_2_srctgt = np.concatenate([mask_10, mask_11], 2) - mask_attn_2_srctgtattn = np.concatenate([mask_20, mask_21, mask_22], 2) - - - tgt_labels = tgt_labels[np.where(tgt_labels != 0)] - return (example_id, src_ids, src_sids, src_pids, - tgt_ids, tgt_sids, tgt_pids, - attn_ids, - mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn, tgt_labels) - - bytes_vocab = {k.encode('utf8'): v for k, v in tokenizer.vocab.items()} - feature_column = propeller.data.FeatureColumns([ - propeller.data.LabelColumn('id'), - propeller.data.TextColumn('src', unk_id=tokenizer.unk_id, vocab_dict=bytes_vocab), - propeller.data.TextColumn('tgt', unk_id=tokenizer.unk_id, vocab_dict=bytes_vocab), - ]) - - train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=False, repeat=True, use_gz=False) \ - .map(map_fn) - - dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \ - .map(map_fn) \ - .padded_batch(args.eval_bsz) \ - .map(after_padding) - - log.debug('shard %d of %d'%(D.parallel.Env().dev_id, D.parallel.Env().nranks)) - train_ds = train_ds.shard(D.parallel.Env().nranks, D.parallel.Env().dev_id).shuffle(10000).padded_batch(args.bsz).map(after_padding) - dev_ds = dev_ds.shard(D.parallel.Env().nranks, D.parallel.Env().dev_id) - - shapes = [[None, None]] * 7 + [[None, None, None]] * 3 +[[None]] - types = ['int64'] * 11 - - train_ds.data_shapes = shapes - train_ds.data_types = types - dev_ds.data_shapes = 
shapes - dev_ds.data_types = types - - vocab_size, _ = model.word_emb.weight.shape - ctx = D.parallel.prepare_context() - model = D.parallel.DataParallel(model, ctx) - g_clip = F.clip.GradientClipByGlobalNorm(1.0) - opt = AdamW(learning_rate=LinearDecay(args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps), parameter_list=model.parameters(), weight_decay=args.wd, grad_clip=g_clip) - attn_id = tokenizer.vocab[args.attn_token] - for step, data in enumerate(train_ds.start(place)): - (example_id, src_ids, src_sids, src_pids, - tgt_ids, tgt_sids, tgt_pids, - attn_ids, - mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn, tgt_labels) = data - - _, __, info = model(src_ids, sent_ids=src_sids, pos_ids=src_pids, attn_bias=mask_src_2_src, encode_only=True) - cached_k, cached_v = info['caches'] - _, __, info = model(tgt_ids, sent_ids=tgt_sids, pos_ids=tgt_pids, attn_bias=mask_tgt_2_srctgt, past_cache=(cached_k, cached_v), encode_only=True) - cached_k2, cached_v2 = info['caches'] - past_cache_k = [L.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)] - past_cache_v = [L.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)] - if args.label_smooth > 0.: - tgt_labels = L.label_smooth(F.one_hot(tgt_labels, vocab_size), epsilon=args.label_smooth) - loss, _, __ = model(attn_ids, sent_ids=tgt_sids, pos_ids=tgt_pids, attn_bias=mask_attn_2_srctgtattn, - past_cache=(past_cache_k, past_cache_v), - tgt_labels=tgt_labels, - tgt_pos=L.where(attn_ids==attn_id)) - - scaled_loss = model.scale_loss(loss) - scaled_loss.backward() - model.apply_collective_grads() - opt.minimize(scaled_loss) - model.clear_gradients() - if step % 10 == 0: - loss = loss.numpy() - ppl = np.exp(loss) - log.debug('[step %d]train loss %.5f, ppl %.5f, lr %.3e' % (step, loss, ppl, opt.current_step_lr())) - if args.save_dir is not None and step % 1000 == 0 and D.parallel.Env().dev_id == 0: - F.save_dygraph(model.state_dict(), args.save_dir) - if args.predict_output_dir is not None and step > args.skip_eval_steps and step % args.eval_steps == 0: - assert os.path.exists(args.predict_output_dir), 'predict_output_dir not found: %s' % args.predict_output_dir - log.debug('doing predict on gpu %d...' 
% D.parallel.Env().dev_id) - evaluate(model, dev_ds, step, args) - if step > args.max_steps: - break - evaluate(model, dev_ds, step, args) - - if args.save_dir is not None: - F.save_dygraph(model.state_dict(), args.save_dir) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser('seq2seq model with ERNIE') - parser.add_argument('--from_pretrained', type=str, required=True, help='pretrained model directory or tag') - parser.add_argument('--bsz', type=int, default=8, help='batchsize') - parser.add_argument('--eval_bsz', type=int, default=20, help='batchsize') - parser.add_argument('--data_dir', type=str, required=True, help='data directory includes train / develop data') - parser.add_argument('--max_steps', type=int, required=True, help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE') - parser.add_argument('--eval_steps', type=int, default=5000, help='evaluation frequency') - parser.add_argument('--skip_eval_steps', type=int, default=1, help='skip evaluate for first n step') - parser.add_argument('--max_encode_len', type=int, default=640) - parser.add_argument('--max_decode_len', type=int, default=120) - parser.add_argument('--tgt_type_id', type=int, default=3) - parser.add_argument('--warmup_proportion', type=float, default=0.1) - parser.add_argument('--beam_width', type=int, default=5) - parser.add_argument('--noise_prob', type=float, default=0.7, help='probability of token be repalced') - parser.add_argument('--use_random_noice', action='store_true', help='if set, replace target tokens with random token from vocabulary, else replace with `[NOISE]`') - parser.add_argument('--lr', type=float, default=5e-5, help='learning rate') - parser.add_argument('--label_smooth', type=float, default=0.1) - parser.add_argument('--length_penalty', type=float, default=1.0) - parser.add_argument('--predict_output_dir', type=str, default=None, help='predict file output directory') - parser.add_argument('--attn_token', type=str, default='[ATTN]', help='if [ATTN] not in vocab, you can specified [MAKK] as attn-token') - parser.add_argument('--inference_model_dir', type=str, default=None, help='inference model output directory') - parser.add_argument('--init_checkpoint', type=str, default=None, help='checkpoint to warm start from') - parser.add_argument('--save_dir', type=str, default=None, help='model output directory') - parser.add_argument('--wd', type=float, default=0.01, help='weight decay, aka L2 regularizer') - - args = parser.parse_args() - - place = F.CUDAPlace(D.parallel.Env().dev_id) - D.guard(place).__enter__() - - ernie = ErnieModelForGeneration.from_pretrained(args.from_pretrained) - tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained, mask_token=None) - rev_dict = {v: k for k, v in tokenizer.vocab.items()} - rev_dict[tokenizer.pad_id] = '' # replace [PAD] - rev_dict[tokenizer.unk_id] = '' # replace [PAD] - - if args.init_checkpoint is not None: - log.info('loading checkpoint from %s' % args.init_checkpoint) - sd, _ = D.load_dygraph(args.init_checkpoint) - ernie.set_dict(sd) - - seq2seq(ernie, tokenizer, args) diff --git a/demo/utils.py b/demo/utils.py new file mode 100644 index 0000000..154b141 --- /dev/null +++ b/demo/utils.py @@ -0,0 +1,44 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import absolute_import +from __future__ import print_function +from __future__ import unicode_literals + +import sys +import argparse +import logging +import paddle + + +class UnpackDataLoader(paddle.io.DataLoader): + def __init__(self, *args, **kwargs): + super(UnpackDataLoader, self).__init__(*args, batch_size=1, **kwargs) + + def __iter__(self): + return ([yy[0] for yy in y] + for y in super(UnpackDataLoader, self).__iter__()) + + +def create_if_not_exists(dir): + try: + dir.mkdir(parents=True) + except FileExistsError: + pass + return dir + + +def get_warmup_and_linear_decay(max_steps, warmup_steps): + return lambda step: min(step / warmup_steps, 1. - (step - warmup_steps) / (max_steps - warmup_steps)) diff --git a/distill/distill.py b/distill/distill.py deleted file mode 100644 index 534d895..0000000 --- a/distill/distill.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
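Among the new demo/utils.py helpers above, `get_warmup_and_linear_decay` supplies the multiplier that `P.optimizer.lr.LambdaDecay` applies to the base learning rate each step: a linear ramp to 1.0 over `warmup_steps`, then a linear decay to 0.0 at `max_steps`. A quick sanity check with toy numbers (assuming the helper as defined above):

    from demo.utils import get_warmup_and_linear_decay

    fn = get_warmup_and_linear_decay(max_steps=100, warmup_steps=10)
    assert fn(5) == 0.5     # warming up: step / warmup_steps
    assert fn(10) == 1.0    # peak multiplier at the end of warmup
    assert fn(55) == 0.5    # decaying: 1 - (step - warmup) / (max - warmup)
    assert fn(100) == 0.0   # reaches zero at max_steps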
-
-import sys
-import os
-
-import numpy as np
-from sklearn.metrics import f1_score
-import paddle as P
-import paddle.fluid as F
-import paddle.fluid.layers as L
-import paddle.fluid.dygraph as D
-import propeller.paddle as propeller
-
-from ernie.tokenizing_ernie import ErnieTokenizer
-from ernie.modeling_ernie import ErnieModelForSequenceClassification
-from ernie.optimization import AdamW, LinearDecay
-
-
-# This demo uses the chnsenticorp Chinese sentiment classification task;
-# the unlabeled data needed for distillation was prepared beforehand via
-# data augmentation
-#
-# Download the data and place it in ./chnsenticorp-data/
-# The data has 3 columns: raw text; space-separated tokens; sentiment label
-# The first column is the input to ERNIE; the second is the input to the
-# BoW (bag-of-words) model
-# The precomputed BoW vocabulary is in ./chnsenticorp-data/vocab.bow.txt
-
-# hyperparameters for finetuning the teacher model
-DATA_DIR='./chnsenticorp-data/'
-SEQLEN=256
-BATCH=32
-EPOCH=10
-LR=5e-5
-
-tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
-
-student_vocab = {i.strip(): l for l, i in enumerate(open(os.path.join(DATA_DIR, 'vocab.bow.txt')).readlines())}
-
-def space_tokenizer(i):
-    return i.decode('utf8').split()
-
-feature_column = propeller.data.FeatureColumns([
-    propeller.data.TextColumn('seg_a', unk_id=tokenizer.unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer.tokenize),
-    propeller.data.TextColumn('seg_a_student', unk_id=student_vocab['[UNK]'], vocab_dict=student_vocab, tokenizer=space_tokenizer),
-    propeller.data.LabelColumn('label', vocab_dict={
-        b"0": 0,
-        b"1": 1,
-    }),
-])
-
-def map_fn(seg_a, seg_a_student, label):
-    seg_a, _ = tokenizer.truncate(seg_a, [], seqlen=SEQLEN)
-    sentence, segments = tokenizer.build_for_ernie(seg_a)
-    return seg_a_student, sentence, segments, label
-
-
-train_ds = feature_column.build_dataset('train', data_dir=os.path.join(DATA_DIR, 'train/'), shuffle=True, repeat=False, use_gz=False) \
-        .map(map_fn) \
-        .padded_batch(BATCH)
-
-train_ds_unlabel = feature_column.build_dataset('train-da', data_dir=os.path.join(DATA_DIR, 'train-data-augmented/'), shuffle=True, repeat=False, use_gz=False) \
-        .map(map_fn) \
-        .padded_batch(BATCH)
-
-dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(DATA_DIR, 'dev/'), shuffle=False, repeat=False, use_gz=False) \
-        .map(map_fn) \
-        .padded_batch(BATCH,)
-
-shapes = ([-1,SEQLEN],[-1,SEQLEN], [-1, SEQLEN], [-1])
-types = ('int64', 'int64', 'int64', 'int64')
-
-train_ds.data_shapes = shapes
-train_ds.data_types = types
-train_ds_unlabel.data_shapes = shapes
-train_ds_unlabel.data_types = types
-dev_ds.data_shapes = shapes
-dev_ds.data_types = types
-
-place = F.CUDAPlace(0)
-D.guard(place).__enter__()
-
-def evaluate_teacher(model, dataset):
-    all_pred, all_label = [], []
-    with D.base._switch_tracer_mode_guard_(is_train=False):
-        model.eval()
-        for step, (ids_student, ids, _, labels) in enumerate(dataset.start()):
-            _, logits = model(ids)
-            pred = L.argmax(logits, -1)
-            all_pred.extend(pred.numpy())
-            all_label.extend(labels.numpy())
-        f1 = f1_score(all_label, all_pred, average='macro')
-        model.train()
-        return f1
-
-
-teacher_model = ErnieModelForSequenceClassification.from_pretrained('ernie-1.0', num_labels=2)
-teacher_model.train()
-if not os.path.exists('./teacher_model.pdparams'):
-    g_clip = F.clip.GradientClipByGlobalNorm(1.0)
-    opt = AdamW(learning_rate=LinearDecay(LR, 9600*EPOCH*0.1/BATCH, 9600*EPOCH/BATCH), parameter_list=teacher_model.parameters(), weight_decay=0.01, grad_clip=g_clip)
-    for epoch in range(EPOCH):
-        for step, (ids_student, ids, sids, labels) in enumerate(train_ds.start(place)):
-            loss, logits = teacher_model(ids, labels=labels)
-            loss.backward()
-            if step % 10 == 0:
-                print('[step %03d] teacher train loss %.5f lr %.3e' % (step, loss.numpy(), opt.current_step_lr()))
-            opt.minimize(loss)
-            teacher_model.clear_gradients()
-            if step % 100 == 0:
-                f1 = evaluate_teacher(teacher_model, dev_ds)
-                print('teacher f1: %.5f' %f1)
-    D.save_dygraph(teacher_model.state_dict(), './teacher_model')
-else:
-    state_dict, _ = D.load_dygraph('./teacher_model')
-    teacher_model.set_dict(state_dict)
-    f1 = evaluate_teacher(teacher_model, dev_ds)
-    print('teacher f1: %.5f' %f1)
-
-
-# hyperparameters for finetuning the student model
-SEQLEN=256
-BATCH=100
-EPOCH=10
-LR=1e-4
-
-
-def evaluate_student(model, dataset):
-    all_pred, all_label = [], []
-    with D.base._switch_tracer_mode_guard_(is_train=False):
-        model.eval()
-        for step, (ids_student, ids, _, labels) in enumerate(dataset.start()):
-            _, logits = model(ids_student)
-            pred = L.argmax(logits, -1)
-            all_pred.extend(pred.numpy())
-            all_label.extend(labels.numpy())
-        f1 = f1_score(all_label, all_pred, average='macro')
-        model.train()
-        return f1
-
-
-class BOW(D.Layer):
-    def __init__(self):
-        super().__init__()
-        self.emb = D.Embedding([len(student_vocab), 128], padding_idx=0)
-        self.fc = D.Linear(128, 2)
-    def forward(self, ids, labels=None):
-        embbed = self.emb(ids)
-        pad_mask = L.unsqueeze(L.cast(ids!=0, 'float32'), [-1])
-
-        embbed = L.reduce_sum(embbed * pad_mask, 1)
-        embbed = L.softsign(embbed)
-        logits = self.fc(embbed)
-        if labels is not None:
-            if len(labels.shape)==1:
-                labels = L.reshape(labels, [-1, 1])
-            loss = L.softmax_with_cross_entropy(logits, labels)
-            loss = L.reduce_mean(loss)
-        else:
-            loss = None
-        return loss, logits
-
-class CNN(D.Layer):
-    def __init__(self):
-        super().__init__()
-        self.emb = D.Embedding([30002, 128], padding_idx=0)
-        self.cnn = D.Conv2D(128, 128, (1, 3), padding=(0, 1), act='relu')
-        self.pool = D.Pool2D((1, 3), pool_padding=(0, 1))
-        self.fc = D.Linear(128, 2)
-    def forward(self, ids, labels=None):
-        embbed = self.emb(ids)
-        #d_batch, d_seqlen = ids.shape
-        hidden = embbed
-        hidden = L.transpose(hidden, [0, 2, 1]) #change to NCWH
-        hidden = L.unsqueeze(hidden, [2])
-        hidden = self.cnn(hidden)
-        hidden = self.pool(hidden)
-        hidden = L.squeeze(hidden, [2])
-        hidden = L.transpose(hidden, [0, 2, 1])
-        pad_mask = L.unsqueeze(L.cast(ids!=0, 'float32'), [-1])
-        hidden = L.softsign(L.reduce_sum(hidden * pad_mask, 1))
-        logits = self.fc(hidden)
-        if labels is not None:
-            if len(labels.shape)==1:
-                labels = L.reshape(labels, [-1, 1])
-            loss = L.softmax_with_cross_entropy(logits, labels)
-            loss = L.reduce_mean(loss)
-        else:
-            loss = None
-        return loss, logits
-
-def KL(pred, target):
-    pred = L.log(L.softmax(pred))
-    target = L.softmax(target)
-    loss = L.kldiv_loss(pred, target)
-    return loss
-
-teacher_model.eval()
-model = BOW()
-g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
-opt = AdamW(learning_rate=LR, parameter_list=model.parameters(), weight_decay=0.01, grad_clip=g_clip)
-model.train()
-for epoch in range(EPOCH):
-    for step, (ids_student, ids, sids, label) in enumerate(train_ds.start(place)):
-        _, logits_t = teacher_model(ids, sids)  # logits from the teacher model
-        logits_t.stop_gradient=True
-        _, logits_s = model(ids_student)  # logits from the student model
-        loss_ce, _ = model(ids_student, labels=label)
-        loss_kd = KL(logits_s, logits_t)  # KL divergence measures the distance between the two distributions
-        loss = loss_ce + loss_kd
-        loss.backward()
-        if step % 10 == 0:
-            print('[step %03d] distill train loss %.5f lr %.3e' % (step, loss.numpy(), opt.current_step_lr()))
-        opt.minimize(loss)
-        model.clear_gradients()
-    f1 = evaluate_student(model, dev_ds)
-    print('student f1 %.5f' % f1)
-
-# finally, one more round of training on hard labels to consolidate the result
-for step, 
(ids_student, ids, sids, label) in enumerate(train_ds.start(place)): - loss, _ = model(ids_student, labels=label) - loss.backward() - if step % 10 == 0: - print('[step %03d] train loss %.5f lr %.3e' % (step, loss.numpy(), opt.current_step_lr())) - opt.minimize(loss) - model.clear_gradients() - -f1 = evaluate_student(model, dev_ds) -print('final f1 %.5f' % f1) - diff --git a/ernie-gen/README.md b/ernie-gen/README.md index cb25e45..3be3484 100644 --- a/ernie-gen/README.md +++ b/ernie-gen/README.md @@ -1,5 +1,5 @@ ```bash - _____ ____ _ _ ___ _____ ____ _____ _ _ + _____ ____ _ _ ___ _____ ____ _____ _ _ | ____| _ \| \ | |_ _| ____| / ___| ____| \ | | | _| | |_) | \| || || _| _____| | _| _| | \| | | |___| _ <| |\ || || |__|_____| |_| | |___| |\ | diff --git a/ernie-vil/README.md b/ernie-vil/README.md index a072e00..663ae64 100644 --- a/ernie-vil/README.md +++ b/ernie-vil/README.md @@ -1,3 +1,3 @@ -![ernie_vil](.meta/ernie-vil.png) +![ernie_vil](.meta/ernie-vil.png) The `ERNIE-ViL` (including our pre-trained models and VCR task-pretrained models) has been released at [here](https://github.com/PaddlePaddle/ERNIE/tree/repro/ernie-vil). diff --git a/ernie/__init__.py b/ernie/__init__.py index db8bc37..9171890 100644 --- a/ernie/__init__.py +++ b/ernie/__init__.py @@ -18,16 +18,13 @@ from __future__ import print_function from __future__ import unicode_literals import paddle -paddle_version = [int(i) for i in paddle.__version__.split('.')] -if paddle_version[1] < 7: - raise RuntimeError('paddle-ernie requires paddle 1.7+, got %s' % +if paddle.__version__ != '0.0.0' and paddle.__version__ < '2.0.0': + raise RuntimeError('propeller 0.2 requires paddle 2.0+, got %s' % paddle.__version__) from ernie.modeling_ernie import ErnieModel -from ernie.modeling_ernie import (ErnieModelForSequenceClassification, - ErnieModelForTokenClassification, - ErnieModelForQuestionAnswering, - ErnieModelForPretraining) +from ernie.modeling_ernie import ( + ErnieModelForSequenceClassification, ErnieModelForTokenClassification, + ErnieModelForQuestionAnswering, ErnieModelForPretraining) from ernie.tokenizing_ernie import ErnieTokenizer, ErnieTinyTokenizer - diff --git a/ernie/file_utils.py b/ernie/file_utils.py index b63d091..e55a5bc 100644 --- a/ernie/file_utils.py +++ b/ernie/file_utils.py @@ -28,7 +28,10 @@ else: log = logging.getLogger(__name__) -def _fetch_from_remote(url, force_download=False, cached_dir='~/.paddle-ernie-cache'): + +def _fetch_from_remote(url, + force_download=False, + cached_dir='~/.paddle-ernie-cache'): import hashlib, tempfile, requests, tarfile sig = hashlib.md5(url.encode('utf8')).hexdigest() cached_dir = Path(cached_dir).expanduser() @@ -44,15 +47,16 @@ def _fetch_from_remote(url, force_download=False, cached_dir='~/.paddle-ernie-ca #url = 'https://ernie.bj.bcebos.com/ERNIE_stable.tgz' r = requests.get(url, stream=True) total_len = int(r.headers.get('content-length')) - for chunk in tqdm(r.iter_content(chunk_size=1024), - total=total_len // 1024, - desc='downloading %s' % url, + for chunk in tqdm( + r.iter_content(chunk_size=1024), + total=total_len // 1024, + desc='downloading %s' % url, unit='KB'): if chunk: - f.write(chunk) + f.write(chunk) f.flush() log.debug('extacting... 
to %s' % tmpfile) - with tarfile.open(tmpfile.as_posix()) as tf: + with tarfile.open(tmpfile.as_posix()) as tf: tf.extractall(path=cached_dir_model.as_posix()) os.remove(tmpfile.as_posix()) log.debug('%s cached in %s' % (url, cached_dir)) @@ -63,5 +67,5 @@ def add_docstring(doc): def func(f): f.__doc__ += ('\n======other docs from supper class ======\n%s' % doc) return f - return func + return func diff --git a/ernie/modeling_ernie.py b/ernie/modeling_ernie.py index 415e956..b8b5e8e 100644 --- a/ernie/modeling_ernie.py +++ b/ernie/modeling_ernie.py @@ -17,68 +17,81 @@ from __future__ import absolute_import from __future__ import print_function from __future__ import unicode_literals -import sys -import os -import argparse import json import logging -import logging -from functools import partial import six if six.PY2: from pathlib2 import Path else: from pathlib import Path -import paddle.fluid.dygraph as D -import paddle.fluid as F -import paddle.fluid.layers as L - +import paddle as P +from paddle import nn +from paddle.nn import functional as F from ernie.file_utils import _fetch_from_remote, add_docstring log = logging.getLogger(__name__) -def _build_linear(n_in, n_out, name, init, act=None): - return D.Linear(n_in, - n_out, - param_attr=F.ParamAttr(name='%s.w_0' % name if name is not None else None, initializer=init), - bias_attr='%s.b_0' % name if name is not None else None, act=act) +ACT_DICT = { + 'relu': nn.ReLU, + 'gelu': nn.GELU, +} + + +def _build_linear(n_in, n_out, name, init): + return nn.Linear( + n_in, + n_out, + weight_attr=P.ParamAttr( + name='%s.w_0' % name if name is not None else None, + initializer=init), + bias_attr='%s.b_0' % name if name is not None else None, ) def _build_ln(n_in, name): - return D.LayerNorm(normalized_shape=n_in, - param_attr=F.ParamAttr(name='%s_layer_norm_scale' % name if name is not None else None, - initializer=F.initializer.Constant(1.)), - bias_attr=F.ParamAttr(name='%s_layer_norm_bias' % name if name is not None else None, - initializer=F.initializer.Constant(1.)), - ) + return nn.LayerNorm( + normalized_shape=n_in, + weight_attr=P.ParamAttr( + name='%s_layer_norm_scale' % name if name is not None else None, + initializer=nn.initializer.Constant(1.)), + bias_attr=P.ParamAttr( + name='%s_layer_norm_bias' % name if name is not None else None, + initializer=nn.initializer.Constant(0.)), ) def append_name(name, postfix): if name is None: - return None + ret = None elif name == '': - return postfix + ret = postfix else: - return '%s_%s' % (name, postfix) + ret = '%s_%s' % (name, postfix) + return ret -class AttentionLayer(D.Layer): +class AttentionLayer(nn.Layer): def __init__(self, cfg, name=None): super(AttentionLayer, self).__init__() - initializer = F.initializer.TruncatedNormal(scale=cfg['initializer_range']) + initializer = nn.initializer.TruncatedNormal( + std=cfg['initializer_range']) d_model = cfg['hidden_size'] n_head = cfg['num_attention_heads'] assert d_model % n_head == 0 - d_model_q = cfg.get('query_hidden_size_per_head', d_model // n_head) * n_head - d_model_v = cfg.get('value_hidden_size_per_head', d_model // n_head) * n_head + d_model_q = cfg.get('query_hidden_size_per_head', + d_model // n_head) * n_head + d_model_v = cfg.get('value_hidden_size_per_head', + d_model // n_head) * n_head self.n_head = n_head self.d_key = d_model_q // n_head - self.q = _build_linear(d_model, d_model_q, append_name(name, 'query_fc'), initializer) - self.k = _build_linear(d_model, d_model_q, append_name(name, 'key_fc'), initializer) - self.v = 
_build_linear(d_model, d_model_v, append_name(name, 'value_fc'), initializer) - self.o = _build_linear(d_model_v, d_model, append_name(name, 'output_fc'), initializer) - self.dropout = lambda i: L.dropout(i, dropout_prob=cfg['attention_probs_dropout_prob'], dropout_implementation="upscale_in_train",) if self.training else i + self.q = _build_linear(d_model, d_model_q, + append_name(name, 'query_fc'), initializer) + self.k = _build_linear(d_model, d_model_q, + append_name(name, 'key_fc'), initializer) + self.v = _build_linear(d_model, d_model_v, + append_name(name, 'value_fc'), initializer) + self.o = _build_linear(d_model_v, d_model, + append_name(name, 'output_fc'), initializer) + self.dropout = nn.Dropout(p=cfg['attention_probs_dropout_prob']) def forward(self, queries, keys, values, attn_bias, past_cache): assert len(queries.shape) == len(keys.shape) == len(values.shape) == 3 @@ -94,66 +107,77 @@ class AttentionLayer(D.Layer): cache = (k, v) if past_cache is not None: cached_k, cached_v = past_cache - k = L.concat([cached_k, k], 1) - v = L.concat([cached_v, v], 1) - - q = L.transpose(L.reshape(q, [0, 0, self.n_head, q.shape[-1] // self.n_head]), [0, 2, 1, 3]) #[batch, head, seq, dim] - k = L.transpose(L.reshape(k, [0, 0, self.n_head, k.shape[-1] // self.n_head]), [0, 2, 1, 3]) #[batch, head, seq, dim] - v = L.transpose(L.reshape(v, [0, 0, self.n_head, v.shape[-1] // self.n_head]), [0, 2, 1, 3]) #[batch, head, seq, dim] - - - q = L.scale(q, scale=self.d_key ** -0.5) - score = L.matmul(q, k, transpose_y=True) + k = P.concat([cached_k, k], 1) + v = P.concat([cached_v, v], 1) + + q = q.reshape( + [0, 0, self.n_head, q.shape[-1] // self.n_head]).transpose( + [0, 2, 1, 3]) #[batch, head, seq, dim] + k = k.reshape( + [0, 0, self.n_head, k.shape[-1] // self.n_head]).transpose( + [0, 2, 1, 3]) #[batch, head, seq, dim] + v = v.reshape( + [0, 0, self.n_head, v.shape[-1] // self.n_head]).transpose( + [0, 2, 1, 3]) #[batch, head, seq, dim] + + q = q.scale(self.d_key**-0.5) + score = q.matmul(k, transpose_y=True) if attn_bias is not None: score += attn_bias - score = L.softmax(score, use_cudnn=True) + score = F.softmax(score) score = self.dropout(score) - out = L.matmul(score, v) - out = L.transpose(out, [0, 2, 1, 3]) - out = L.reshape(out, [0, 0, out.shape[2] * out.shape[3]]) - + out = score.matmul(v).transpose([0, 2, 1, 3]) + out = out.reshape([0, 0, out.shape[2] * out.shape[3]]) out = self.o(out) return out, cache -class PositionwiseFeedForwardLayer(D.Layer): +class PositionwiseFeedForwardLayer(nn.Layer): def __init__(self, cfg, name=None): super(PositionwiseFeedForwardLayer, self).__init__() - initializer = F.initializer.TruncatedNormal(scale=cfg['initializer_range']) + initializer = nn.initializer.TruncatedNormal( + std=cfg['initializer_range']) d_model = cfg['hidden_size'] d_ffn = cfg.get('intermediate_size', 4 * d_model) - assert cfg['hidden_act'] in ['relu', 'gelu'] - self.i = _build_linear(d_model, d_ffn, append_name(name, 'fc_0'), initializer, act=cfg['hidden_act']) - self.o = _build_linear(d_ffn, d_model, append_name(name, 'fc_1'), initializer) + self.act = ACT_DICT[cfg['hidden_act']]() + self.i = _build_linear( + d_model, + d_ffn, + append_name(name, 'fc_0'), + initializer, ) + self.o = _build_linear(d_ffn, d_model, + append_name(name, 'fc_1'), initializer) prob = cfg.get('intermediate_dropout_prob', 0.) 
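+        # note: `intermediate_size` defaults to 4 * d_model and
+        # `intermediate_dropout_prob` defaults to 0. when the config omits them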
- self.dropout = lambda i: L.dropout(i, dropout_prob=prob, dropout_implementation="upscale_in_train",) if self.training else i + self.dropout = nn.Dropout(p=prob) def forward(self, inputs): - hidden = self.i(inputs) + hidden = self.act(self.i(inputs)) hidden = self.dropout(hidden) out = self.o(hidden) return out -class ErnieBlock(D.Layer): +class ErnieBlock(nn.Layer): def __init__(self, cfg, name=None): super(ErnieBlock, self).__init__() d_model = cfg['hidden_size'] - initializer = F.initializer.TruncatedNormal(scale=cfg['initializer_range']) - - self.attn = AttentionLayer(cfg, name = append_name(name, 'multi_head_att')) - self.ln1 = _build_ln(d_model, name = append_name(name, 'post_att')) - self.ffn = PositionwiseFeedForwardLayer(cfg, name = append_name(name, 'ffn')) - self.ln2 = _build_ln(d_model, name = append_name(name, 'post_ffn')) + self.attn = AttentionLayer( + cfg, name=append_name(name, 'multi_head_att')) + self.ln1 = _build_ln(d_model, name=append_name(name, 'post_att')) + self.ffn = PositionwiseFeedForwardLayer( + cfg, name=append_name(name, 'ffn')) + self.ln2 = _build_ln(d_model, name=append_name(name, 'post_ffn')) prob = cfg.get('intermediate_dropout_prob', cfg['hidden_dropout_prob']) - self.dropout = lambda i: L.dropout(i, dropout_prob=prob, dropout_implementation="upscale_in_train",) if self.training else i + self.dropout = nn.Dropout(p=prob) def forward(self, inputs, attn_bias=None, past_cache=None): - attn_out, cache = self.attn(inputs, inputs, inputs, attn_bias, past_cache=past_cache) #self attn + attn_out, cache = self.attn( + inputs, inputs, inputs, attn_bias, + past_cache=past_cache) #self attn attn_out = self.dropout(attn_out) - hidden = attn_out + inputs - hidden = self.ln1(hidden) # dropout/ add/ norm + hidden = attn_out + inputs + hidden = self.ln1(hidden) # dropout/ add/ norm ffn_out = self.ffn(hidden) ffn_out = self.dropout(ffn_out) @@ -161,16 +185,22 @@ class ErnieBlock(D.Layer): hidden = self.ln2(hidden) return hidden, cache - -class ErnieEncoderStack(D.Layer): + +class ErnieEncoderStack(nn.Layer): def __init__(self, cfg, name=None): super(ErnieEncoderStack, self).__init__() n_layers = cfg['num_hidden_layers'] - self.block = D.LayerList([ErnieBlock(cfg, append_name(name, 'layer_%d' % i)) for i in range(n_layers)]) + self.block = nn.LayerList([ + ErnieBlock(cfg, append_name(name, 'layer_%d' % i)) + for i in range(n_layers) + ]) def forward(self, inputs, attn_bias=None, past_cache=None): if past_cache is not None: - assert isinstance(past_cache, tuple), 'unknown type of `past_cache`, expect tuple or list. got %s' % repr(type(past_cache)) + assert isinstance( + past_cache, tuple + ), 'unknown type of `past_cache`, expect tuple or list. 
got %s' % repr( + type(past_cache)) past_cache = list(zip(*past_cache)) else: past_cache = [None] * len(self.block) @@ -191,11 +221,15 @@ class PretrainedModel(object): resource_map = { 'ernie-1.0': bce + 'model-ernie1.0.1.tar.gz', 'ernie-2.0-en': bce + 'model-ernie2.0-en.1.tar.gz', - 'ernie-2.0-large-en': bce + 'model-ernie2.0-large-en.1.tar.gz', + 'ernie-2.0-large-en': bce + 'model-ernie2.0-large-en.1.tar.gz', 'ernie-tiny': bce + 'model-ernie_tiny.1.tar.gz', } + @classmethod - def from_pretrained(cls, pretrain_dir_or_url, force_download=False, **kwargs): + def from_pretrained(cls, + pretrain_dir_or_url, + force_download=False, + **kwargs): if not Path(pretrain_dir_or_url).exists() and pretrain_dir_or_url in cls.resource_map: url = cls.resource_map[pretrain_dir_or_url] log.info('get pretrain dir from %s' % url) @@ -206,41 +240,42 @@ class PretrainedModel(object): if not pretrain_dir.exists(): raise ValueError('pretrain dir not found: %s' % pretrain_dir) - param_path = pretrain_dir /'params' - state_dict_path = pretrain_dir / 'saved_weights' + state_dict_path = pretrain_dir / 'saved_weights.pdparams' config_path = pretrain_dir / 'ernie_config.json' if not config_path.exists(): raise ValueError('config path not found: %s' % config_path) - name_prefix=kwargs.pop('name', None) + name_prefix = kwargs.pop('name', None) cfg_dict = dict(json.loads(config_path.open().read()), **kwargs) model = cls(cfg_dict, name=name_prefix) - + log.info('loading pretrained model from %s' % pretrain_dir) + #param_path = pretrain_dir / 'params' #if os.path.exists(param_path): # raise NotImplementedError() # log.debug('load pretrained weight from program state') # F.io.load_program_state(param_path) #buggy in dygraph.gurad, push paddle to fix - if state_dict_path.with_suffix('.pdparams').exists(): - m, _ = D.load_dygraph(state_dict_path.as_posix()) + if state_dict_path.exists(): + m = P.load(state_dict_path) for k, v in model.state_dict().items(): if k not in m: log.warn('param:%s not set in pretrained model, skip' % k) - m[k] = v # FIXME: no need to do this in the future - model.set_dict(m) + m[k] = v # FIXME: no need to do this in the future + model.set_state_dict(m) else: - raise ValueError('weight file not found in pretrain dir: %s' % pretrain_dir) + raise ValueError('weight file not found in pretrain dir: %s' % + pretrain_dir) return model -class ErnieModel(D.Layer, PretrainedModel): +class ErnieModel(nn.Layer, PretrainedModel): def __init__(self, cfg, name=None): """ Fundamental pretrained Ernie model """ log.debug('init ErnieModel with config: %s' % repr(cfg)) - D.Layer.__init__(self) + nn.Layer.__init__(self) d_model = cfg['hidden_size'] d_emb = cfg.get('emb_size', cfg['hidden_size']) d_vocab = cfg['vocab_size'] @@ -248,54 +283,84 @@ class ErnieModel(D.Layer, PretrainedModel): d_sent = cfg.get("sent_type_vocab_size") or cfg['type_vocab_size'] self.n_head = cfg['num_attention_heads'] self.return_additional_info = cfg.get('return_additional_info', False) - initializer = F.initializer.TruncatedNormal(scale=cfg['initializer_range']) + initializer = nn.initializer.TruncatedNormal( + std=cfg['initializer_range']) self.ln = _build_ln(d_model, name=append_name(name, 'pre_encoder')) - self.word_emb = D.Embedding([d_vocab, d_emb], param_attr=F.ParamAttr(name=append_name(name, 'word_embedding'), initializer=initializer)) - self.pos_emb = D.Embedding([d_pos, d_emb], param_attr=F.ParamAttr(name=append_name(name, 'pos_embedding'), initializer=initializer)) - self.sent_emb = D.Embedding([d_sent, d_emb], 
param_attr=F.ParamAttr(name=append_name(name, 'sent_embedding'), initializer=initializer)) + self.word_emb = nn.Embedding( + d_vocab, + d_emb, + weight_attr=P.ParamAttr( + name=append_name(name, 'word_embedding'), + initializer=initializer)) + self.pos_emb = nn.Embedding( + d_pos, + d_emb, + weight_attr=P.ParamAttr( + name=append_name(name, 'pos_embedding'), + initializer=initializer)) + self.sent_emb = nn.Embedding( + d_sent, + d_emb, + weight_attr=P.ParamAttr( + name=append_name(name, 'sent_embedding'), + initializer=initializer)) prob = cfg['hidden_dropout_prob'] - self.dropout = lambda i: L.dropout(i, dropout_prob=prob, dropout_implementation="upscale_in_train",) if self.training else i + self.dropout = nn.Dropout(p=prob) - self.encoder_stack = ErnieEncoderStack(cfg, append_name(name, 'encoder')) + self.encoder_stack = ErnieEncoderStack(cfg, + append_name(name, 'encoder')) if cfg.get('has_pooler', True): - self.pooler = _build_linear(cfg['hidden_size'], cfg['hidden_size'], append_name(name, 'pooled_fc'), initializer, act='tanh') + self.pooler = _build_linear( + cfg['hidden_size'], + cfg['hidden_size'], + append_name(name, 'pooled_fc'), + initializer, ) else: self.pooler = None self.train() - + #FIXME:remove this def eval(self): - if F.in_dygraph_mode(): + if P.in_dynamic_mode(): super(ErnieModel, self).eval() self.training = False for l in self.sublayers(): l.training = False + return self def train(self): - if F.in_dygraph_mode(): + if P.in_dynamic_mode(): super(ErnieModel, self).train() self.training = True for l in self.sublayers(): l.training = True - - def forward(self, src_ids, sent_ids=None, pos_ids=None, input_mask=None, attn_bias=None, past_cache=None, use_causal_mask=False): + return self + + def forward(self, + src_ids, + sent_ids=None, + pos_ids=None, + input_mask=None, + attn_bias=None, + past_cache=None, + use_causal_mask=False): """ Args: - src_ids (`Variable` of shape `[batch_size, seq_len]`): + src_ids (`Variable` of shape `[batch_size, seq_len]`): Indices of input sequence tokens in the vocabulary. - sent_ids (optional, `Variable` of shape `[batch_size, seq_len]`): + sent_ids (optional, `Variable` of shape `[batch_size, seq_len]`): aka token_type_ids, Segment token indices to indicate first and second portions of the inputs. if None, assume all tokens come from `segment_a` - pos_ids(optional, `Variable` of shape `[batch_size, seq_len]`): + pos_ids(optional, `Variable` of shape `[batch_size, seq_len]`): Indices of positions of each input sequence tokens in the position embeddings. - input_mask(optional `Variable` of shape `[batch_size, seq_len]`): + input_mask(optional `Variable` of shape `[batch_size, seq_len]`): Mask to avoid performing attention on the padding token indices of the encoder input. - attn_bias(optional, `Variable` of shape `[batch_size, seq_len, seq_len] or False`): + attn_bias(optional, `Variable` of shape `[batch_size, seq_len, seq_len] or False`): 3D version of `input_mask`, if set, overrides `input_mask`; if set not False, will not apply attention mask past_cache(optional, tuple of two lists: cached key and cached value, each is a list of `Variable`s of shape `[batch_size, seq_len, hidden_size]`): - cached key/value tensor that will be concated to generated key/value when performing self attention. + cached key/value tensor that will be concated to generated key/value when performing self attention. if set, `attn_bias` should not be None. 
Returns:
@@ -306,33 +371,38 @@ class ErnieModel(D.Layer, PretrainedModel):
             info (Dictionary):
                 addtional middle level info, inclues: all hidden stats, k/v caches.
         """
-        #d_batch, d_seqlen = src_ids.shape
-        assert len(src_ids.shape) == 2, 'expect src_ids.shape = [batch, sequecen], got %s' % (repr(src_ids.shape))
+        assert len(
+            src_ids.
+            shape) == 2, 'expect src_ids.shape = [batch, sequence], got %s' % (
+                repr(src_ids.shape))
         assert attn_bias is not None if past_cache else True, 'if `past_cache` is specified; attn_bias should not be None'
-        d_batch = L.shape(src_ids)[0]
-        d_seqlen = L.shape(src_ids)[1]
+        d_seqlen = P.shape(src_ids)[1]
         if pos_ids is None:
-            pos_ids = L.reshape(L.range(0, d_seqlen, 1, dtype='int32'), [1, -1])
-            pos_ids = L.cast(pos_ids, 'int64')
+            pos_ids = P.arange(
+                0, d_seqlen, 1, dtype='int32').reshape([1, -1]).cast('int64')
         if attn_bias is None:
             if input_mask is None:
-                input_mask = L.cast(src_ids != 0, 'float32')
+                input_mask = P.cast(src_ids != 0, 'float32')
             assert len(input_mask.shape) == 2
-            input_mask = L.unsqueeze(input_mask, axes=[-1])
-            attn_bias = L.matmul(input_mask, input_mask, transpose_y=True)
+            input_mask = input_mask.unsqueeze(-1)
+            attn_bias = input_mask.matmul(input_mask, transpose_y=True)
             if use_causal_mask:
-                sequence = L.reshape(L.range(0, d_seqlen, 1, dtype='float32') + 1., [1, 1, -1, 1])
-                causal_mask = L.cast((L.matmul(sequence, 1. / sequence, transpose_y=True) >= 1.) , 'float32')
+                sequence = P.reshape(
+                    P.arange(
+                        0, d_seqlen, 1, dtype='float32') + 1., [1, 1, -1, 1])
+                causal_mask = (sequence.matmul(
+                    1. / sequence, transpose_y=True) >= 1.).cast('float32')
                 attn_bias *= causal_mask
         else:
-            assert len(attn_bias.shape) == 3, 'expect attn_bias tobe rank 3, got %r' % attn_bias.shape
+            assert len(
+                attn_bias.shape
+            ) == 3, 'expect attn_bias to be rank 3, got %r' % attn_bias.shape
 
         attn_bias = (1. 
- attn_bias) * -10000.0 - attn_bias = L.unsqueeze(attn_bias, [1]) - attn_bias = L.expand(attn_bias, [1, self.n_head, 1, 1]) # avoid broadcast =_= - attn_bias.stop_gradient = True - + attn_bias = attn_bias.unsqueeze(1).tile( + [1, self.n_head, 1, 1]) # avoid broadcast =_= + if sent_ids is None: - sent_ids = L.zeros_like(src_ids) + sent_ids = P.zeros_like(src_ids) src_embedded = self.word_emb(src_ids) pos_embedded = self.pos_emb(pos_ids) @@ -341,9 +411,10 @@ class ErnieModel(D.Layer, PretrainedModel): embedded = self.dropout(self.ln(embedded)) - encoded, hidden_list, cache_list = self.encoder_stack(embedded, attn_bias, past_cache=past_cache) + encoded, hidden_list, cache_list = self.encoder_stack( + embedded, attn_bias, past_cache=past_cache) if self.pooler is not None: - pooled = self.pooler(encoded[:, 0, :]) + pooled = F.tanh(self.pooler(encoded[:, 0, :])) else: pooled = None @@ -354,29 +425,32 @@ class ErnieModel(D.Layer, PretrainedModel): if self.return_additional_info: return pooled, encoded, additional_info - else: - return pooled, encoded - + return pooled, encoded + class ErnieModelForSequenceClassification(ErnieModel): """ - Ernie Model for text classfication or pointwise ranking tasks + Ernie Model for text classfication or pointwise ranking tasks """ def __init__(self, cfg, name=None): - super(ErnieModelForSequenceClassification, self).__init__(cfg, name=name) + super(ErnieModelForSequenceClassification, self).__init__( + cfg, name=name) - initializer = F.initializer.TruncatedNormal(scale=cfg['initializer_range']) - self.classifier = _build_linear(cfg['hidden_size'], cfg['num_labels'], append_name(name, 'cls'), initializer) + initializer = nn.initializer.TruncatedNormal( + std=cfg['initializer_range']) + self.classifier = _build_linear(cfg['hidden_size'], cfg['num_labels'], + append_name(name, 'cls'), initializer) prob = cfg.get('classifier_dropout_prob', cfg['hidden_dropout_prob']) - self.dropout = lambda i: L.dropout(i, dropout_prob=prob, dropout_implementation="upscale_in_train",) if self.training else i + self.dropout = nn.Dropout(p=prob) + self.train() @add_docstring(ErnieModel.forward.__doc__) def forward(self, *args, **kwargs): """ Args: - labels (optional, `Variable` of shape [batch_size]): + labels (optional, `Variable` of shape [batch_size]): ground truth label id for each sentence Returns: loss (`Variable` of shape []): @@ -386,15 +460,15 @@ class ErnieModelForSequenceClassification(ErnieModel): output logits of classifier """ labels = kwargs.pop('labels', None) - pooled, encoded = super(ErnieModelForSequenceClassification, self).forward(*args, **kwargs) + pooled, encoded = super(ErnieModelForSequenceClassification, + self).forward(*args, **kwargs) hidden = self.dropout(pooled) logits = self.classifier(hidden) if labels is not None: - if len(labels.shape) == 1: - labels = L.reshape(labels, [-1, 1]) - loss = L.softmax_with_cross_entropy(logits, labels) - loss = L.reduce_mean(loss) + if len(labels.shape) != 1: + labels = labels.squeeze() + loss = F.cross_entropy(logits, labels) else: loss = None return loss, logits @@ -404,20 +478,24 @@ class ErnieModelForTokenClassification(ErnieModel): """ Ernie Model for Named entity tasks(NER) """ + def __init__(self, cfg, name=None): super(ErnieModelForTokenClassification, self).__init__(cfg, name=name) - initializer = F.initializer.TruncatedNormal(scale=cfg['initializer_range']) - self.classifier = _build_linear(cfg['hidden_size'], cfg['num_labels'], append_name(name, 'cls'), initializer) + initializer = 
nn.initializer.TruncatedNormal( + std=cfg['initializer_range']) + self.classifier = _build_linear(cfg['hidden_size'], cfg['num_labels'], + append_name(name, 'cls'), initializer) prob = cfg.get('classifier_dropout_prob', cfg['hidden_dropout_prob']) - self.dropout = lambda i: L.dropout(i, dropout_prob=prob, dropout_implementation="upscale_in_train",) if self.training else i + self.dropout = nn.Dropout(p=prob) + self.train() @add_docstring(ErnieModel.forward.__doc__) def forward(self, *args, **kwargs): """ Args: - labels (optional, `Variable` of shape [batch_size, seq_len]): + labels (optional, `Variable` of shape [batch_size, seq_len]): ground truth label id for each token Returns: loss (`Variable` of shape []): @@ -433,17 +511,19 @@ class ErnieModelForTokenClassification(ErnieModel): ignore_index = kwargs.pop('ignore_index', -100) labels = kwargs.pop('labels', None) loss_weights = kwargs.pop('loss_weights', None) - pooled, encoded = super(ErnieModelForTokenClassification, self).forward(*args, **kwargs) - hidden = self.dropout(encoded) # maybe not? + pooled, encoded = super(ErnieModelForTokenClassification, + self).forward(*args, **kwargs) + hidden = self.dropout(encoded) # maybe not? logits = self.classifier(hidden) if labels is not None: - if len(labels.shape) == 2: - labels = L.unsqueeze(labels, axes=[-1]) - loss = L.softmax_with_cross_entropy(logits, labels, ignore_index=ignore_index) + if len(labels.shape) != 2: + labels = labels.squeeze() + loss = F.cross_entropy( + logits, labels, ignore_index=ignore_index, reduction='none') if loss_weights is not None: - loss = L.squeeze(loss, [-1]) * loss_weights - loss = L.reduce_mean(loss) + loss = loss * loss_weights + loss = loss.mean() else: loss = None return loss, logits @@ -453,22 +533,27 @@ class ErnieModelForQuestionAnswering(ErnieModel): """ Ernie model for reading comprehension tasks (SQuAD) """ + def __init__(self, cfg, name=None): super(ErnieModelForQuestionAnswering, self).__init__(cfg, name=name) - initializer = F.initializer.TruncatedNormal(scale=cfg['initializer_range']) - self.classifier = _build_linear(cfg['hidden_size'], 2, append_name(name, 'cls_mrc'), initializer) + initializer = nn.initializer.TruncatedNormal( + std=cfg['initializer_range']) + self.classifier = _build_linear(cfg['hidden_size'], 2, + append_name(name, 'cls_mrc'), + initializer) prob = cfg.get('classifier_dropout_prob', cfg['hidden_dropout_prob']) - self.dropout = lambda i: L.dropout(i, dropout_prob=prob, dropout_implementation="upscale_in_train",) if self.training else i + self.dropout = nn.Dropout(p=prob) + self.train() @add_docstring(ErnieModel.forward.__doc__) def forward(self, *args, **kwargs): """ Args: - start_pos (optional, `Variable` of shape [batch_size]): + start_pos (optional, `Variable` of shape [batch_size]): token index of start of answer span in `context` - end_pos (optional, `Variable` of shape [batch_size]): + end_pos (optional, `Variable` of shape [batch_size]): token index of end of answer span in `context` Returns: loss (`Variable` of shape []): @@ -482,35 +567,38 @@ class ErnieModelForQuestionAnswering(ErnieModel): start_pos = kwargs.pop('start_pos', None) end_pos = kwargs.pop('end_pos', None) - pooled, encoded = super(ErnieModelForQuestionAnswering, self).forward(*args, **kwargs) + pooled, encoded = super(ErnieModelForQuestionAnswering, self).forward( + *args, **kwargs) encoded = self.dropout(encoded) encoded = self.classifier(encoded) - start_logit, end_logits = L.unstack(encoded, axis=-1) + start_logit, end_logits = P.unstack(encoded, 
axis=-1) if start_pos is not None and end_pos is not None: - if len(start_pos.shape) == 1: - start_pos = L.unsqueeze(start_pos, axes=[-1]) - if len(end_pos.shape) == 1: - end_pos = L.unsqueeze(end_pos, axes=[-1]) - start_loss = L.softmax_with_cross_entropy(start_logit, start_pos) - end_loss = L.softmax_with_cross_entropy(end_logits, end_pos) - loss = (L.reduce_mean(start_loss) + L.reduce_mean(end_loss)) / 2. + if len(start_pos.shape) != 1: + start_pos = start_pos.squeeze() + if len(end_pos.shape) != 1: + end_pos = end_pos.squeeze() + start_loss = F.cross_entropy(start_logit, start_pos) + end_loss = F.cross_entropy(end_logits, end_pos) + loss = (start_loss.mean() + end_loss.mean()) / 2. else: loss = None return loss, start_logit, end_logits -class NSPHead(D.Layer): +class NSPHead(nn.Layer): def __init__(self, cfg, name=None): super(NSPHead, self).__init__() - initializer = F.initializer.TruncatedNormal(scale=cfg['initializer_range']) - self.nsp = _build_linear(cfg['hidden_size'], 2, append_name(name, 'nsp_fc'), initializer) + initializer = nn.initializer.TruncatedNormal( + std=cfg['initializer_range']) + self.nsp = _build_linear(cfg['hidden_size'], 2, + append_name(name, 'nsp_fc'), initializer) def forward(self, inputs, labels): """ Args: - start_pos (optional, `Variable` of shape [batch_size]): + start_pos (optional, `Variable` of shape [batch_size]): token index of start of answer span in `context` - end_pos (optional, `Variable` of shape [batch_size]): + end_pos (optional, `Variable` of shape [batch_size]): token index of end of answer span in `context` Returns: loss (`Variable` of shape []): @@ -523,8 +611,7 @@ class NSPHead(D.Layer): """ logits = self.nsp(inputs) - loss = L.softmax_with_cross_entropy(logits, labels) - loss = L.reduce_mean(loss) + loss = F.cross_entropy(logits, labels) return loss @@ -532,34 +619,41 @@ class ErnieModelForPretraining(ErnieModel): """ Ernie Model for Masked Languate Model pretrain """ + def __init__(self, cfg, name=None): super(ErnieModelForPretraining, self).__init__(cfg, name=name) - initializer = F.initializer.TruncatedNormal(scale=cfg['initializer_range']) + initializer = nn.initializer.TruncatedNormal( + std=cfg['initializer_range']) d_model = cfg['hidden_size'] d_vocab = cfg['vocab_size'] - self.pooler_heads = D.LayerList([NSPHead(cfg, name=name)]) - self.mlm = _build_linear(d_model, d_model, append_name(name, 'mask_lm_trans_fc'), initializer, act=cfg['hidden_act']) - self.mlm_ln = _build_ln(d_model, name = append_name(name, 'mask_lm_trans')) - self.mlm_bias = L.create_parameter( - dtype='float32', - shape=[d_vocab], - attr=F.ParamAttr( - name=append_name(name, 'mask_lm_out_fc.b_0'), - initializer=F.initializer.Constant(value=0.0) - ), - is_bias=True, - ) + self.pooler_heads = nn.LayerList([NSPHead(cfg, name=name)]) + self.mlm = _build_linear( + d_model, + d_model, + append_name(name, 'mask_lm_trans_fc'), + initializer, ) + self.act = ACT_DICT[cfg['hidden_act']]() + self.mlm_ln = _build_ln( + d_model, name=append_name(name, 'mask_lm_trans')) + self.mlm_bias = P.create_parameter( + dtype='float32', + shape=[d_vocab], + attr=P.ParamAttr( + name=append_name(name, 'mask_lm_out_fc.b_0'), + initializer=nn.initializer.Constant(value=0.0)), + is_bias=True, ) + self.train() @add_docstring(ErnieModel.forward.__doc__) def forward(self, *args, **kwargs): """ Args: - nsp_labels (optional, `Variable` of shape [batch_size]): + nsp_labels (optional, `Variable` of shape [batch_size]): labels for `next sentence prediction` tasks - mlm_pos (optional, `Variable` of 
shape [n_mask, 2]): + mlm_pos (optional, `Variable` of shape [n_mask, 2]): index of mask_id in `src_ids`, can be obtained from `fluid.layers.where(src_ids==mask_id)` - labels (optional, `Variable` of shape [n_mask]): + labels (optional, `Variable` of shape [n_mask]): labels for `mask language model` tasks, the original token indices in masked position in `src_ids` Returns: loss (`Variable` of shape []): @@ -573,19 +667,21 @@ class ErnieModelForPretraining(ErnieModel): mlm_labels = kwargs.pop('labels') mlm_pos = kwargs.pop('mlm_pos') nsp_labels = kwargs.pop('nsp_labels') - pooled, encoded = super(ErnieModelForPretraining, self).forward(*args, **kwargs) - if len(mlm_labels.shape) == 1: - mlm_labels = L.reshape(mlm_labels, [-1, 1]) + pooled, encoded = super(ErnieModelForPretraining, self).forward( + *args, **kwargs) + if len(mlm_labels.shape) != 1: + mlm_labels = mlm_labels.squeeze() if len(nsp_labels.shape) == 1: - nsp_labels = L.reshape(nsp_labels, [-1, 1]) + nsp_labels = nsp_labels.squeeze() nsp_loss = self.pooler_heads[0](pooled, nsp_labels) - encoded_2d = L.gather_nd(encoded, mlm_pos) - encoded_2d = self.mlm(encoded_2d) + encoded_2d = encoded.gather_nd(mlm_pos) + encoded_2d = self.act(self.mlm(encoded_2d)) encoded_2d = self.mlm_ln(encoded_2d) - logits_2d = L.matmul(encoded_2d, self.word_emb.weight, transpose_y=True) + self.mlm_bias - mlm_loss = L.reduce_mean(L.softmax_with_cross_entropy(logits_2d, mlm_labels)) + logits_2d = encoded_2d.matmul( + self.word_emb.weight, transpose_y=True) + self.mlm_bias + mlm_loss = F.cross_entropy(logits_2d, mlm_labels) total_loss = mlm_loss + nsp_loss return total_loss, mlm_loss, nsp_loss @@ -595,30 +691,40 @@ class ErnieModelForGeneration(ErnieModel): Ernie Model for sequence to sequence generation. """ resource_map = { - 'ernie-gen-base-en': ErnieModel.bce + 'model-ernie-gen-base-en.1.tar.gz', - 'ernie-gen-large-en': ErnieModel.bce + 'model-ernie-gen-large-en.1.tar.gz', - 'ernie-gen-large-430g-en': ErnieModel.bce + 'model-ernie-gen-large-430g-en.1.tar.gz', + 'ernie-gen-base-en': + ErnieModel.bce + 'model-ernie-gen-base-en.1.tar.gz', + 'ernie-gen-large-en': + ErnieModel.bce + 'model-ernie-gen-large-en.1.tar.gz', + 'ernie-gen-large-430g-en': + ErnieModel.bce + 'model-ernie-gen-large-430g-en.1.tar.gz', 'ernie-1.0': ErnieModel.bce + 'model-ernie1.0.1.tar.gz', } + def __init__(self, cfg, name=None): cfg['return_additional_info'] = True cfg['has_pooler'] = False super(ErnieModelForGeneration, self).__init__(cfg, name=name) - initializer = F.initializer.TruncatedNormal(scale=cfg['initializer_range']) + initializer = nn.initializer.TruncatedNormal( + std=cfg['initializer_range']) d_model = cfg['hidden_size'] d_vocab = cfg['vocab_size'] - self.mlm = _build_linear(d_model, d_model, append_name(name, 'mask_lm_trans_fc'), initializer, act=cfg['hidden_act']) - self.mlm_ln = _build_ln(d_model, name = append_name(name, 'mask_lm_trans')) - self.mlm_bias = L.create_parameter( - dtype='float32', - shape=[d_vocab], - attr=F.ParamAttr( - name=append_name(name, 'mask_lm_out_fc.b_0'), - initializer=F.initializer.Constant(value=0.0) - ), - is_bias=True, - ) + self.mlm = _build_linear( + d_model, + d_model, + append_name(name, 'mask_lm_trans_fc'), + initializer, ) + self.act = ACT_DICT[cfg['hidden_act']]() + self.mlm_ln = _build_ln( + d_model, name=append_name(name, 'mask_lm_trans')) + self.mlm_bias = P.create_parameter( + dtype='float32', + shape=[d_vocab], + attr=P.ParamAttr( + name=append_name(name, 'mask_lm_out_fc.b_0'), + initializer=nn.initializer.Constant(value=0.0)), + 
is_bias=True, ) + self.train() @add_docstring(ErnieModel.forward.__doc__) def forward(self, *args, **kwargs): @@ -643,23 +749,21 @@ class ErnieModelForGeneration(ErnieModel): _, encoded, info = ErnieModel.forward(self, *args, **kwargs) if encode_only: return None, None, info - elif tgt_labels is None or tgt_pos is None: - encoded = self.mlm(encoded) + if tgt_labels is None or tgt_pos is None: + encoded = self.act(self.mlm(encoded)) encoded = self.mlm_ln(encoded) - logits = L.matmul(encoded, self.word_emb.weight, transpose_y=True) + self.mlm_bias - output_ids = L.argmax(logits, -1) + logits = encoded.matmul( + self.word_emb.weight, transpose_y=True) + self.mlm_bias + output_ids = logits.cast('float32').argmax(-1) return output_ids, logits, info else: - encoded_2d = L.gather_nd(encoded, tgt_pos) - encoded_2d = self.mlm(encoded_2d) + encoded_2d = encoded.gather_nd(tgt_pos) + encoded_2d = self.act(self.mlm(encoded_2d)) encoded_2d = self.mlm_ln(encoded_2d) - logits_2d = L.matmul(encoded_2d, self.word_emb.weight, transpose_y=True) + self.mlm_bias - if len(tgt_labels.shape) == 1: - tgt_labels = L.reshape(tgt_labels, [-1, 1]) - - loss = L.reduce_mean( - L.softmax_with_cross_entropy(logits_2d, tgt_labels, soft_label=(tgt_labels.shape[-1] != 1)) - ) - return loss, logits_2d, info - + logits_2d = encoded_2d.matmul( + self.word_emb.weight, transpose_y=True) + self.mlm_bias + assert len( + tgt_labels.shape) == 2, 'expect 2d label, got %r' % tgt_labels + loss = F.cross_entropy(logits_2d, tgt_labels, soft_label=True) + return loss, logits_2d, info diff --git a/ernie/optimization.py b/ernie/optimization.py deleted file mode 100644 index da916e4..0000000 --- a/ernie/optimization.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals -from __future__ import absolute_import - -import logging -import re - -import numpy as np -import paddle.fluid as F -import paddle.fluid.layers as L -import paddle.fluid.dygraph as D - -log = logging.getLogger(__name__) - -def linear_warmup_decay(learning_rate, warmup_steps, num_train_steps): - """ Applies linear warmup of learning rate from 0 and decay to 0.""" - with F.default_main_program()._lr_schedule_guard(): - lr = L.tensor.create_global_var( - shape=[1], - value=0.0, - dtype='float32', - persistable=True, - name="scheduled_learning_rate") - - global_step = L.learning_rate_scheduler._decay_step_counter() - - warmup_lr = learning_rate * (global_step / warmup_steps) - - poly_decay_lr = L.learning_rate_scheduler.polynomial_decay( - learning_rate=learning_rate, - decay_steps=num_train_steps, - end_learning_rate=0.0, - power=1.0, - cycle=False) -# - decayed_lr = L.elementwise_min(warmup_lr, poly_decay_lr) - L.assign(decayed_lr, lr) - return lr - - -def optimization(loss, - warmup_steps, - num_train_steps, - learning_rate, - train_program, - startup_prog, - weight_decay, - scheduler='linear_warmup_decay', - use_fp16=False, - init_loss_scaling=128, - incr_every_n_steps=1000, - decr_every_n_nan_or_inf=2, - incr_ratio=2.0, - decr_ratio=0.8): - """do backword for static""" - - def exclude_from_weight_decay(param): - name = param.name.rstrip('.master') - if name.find("layer_norm") > -1: - return True - bias_suffix = ["_bias", "_b", ".b_0"] - for suffix in bias_suffix: - if name.endswith(suffix): - return True - return False - - if warmup_steps > 0: - if scheduler == 'noam_decay': - scheduled_lr = L.learning_rate_scheduler\ - .noam_decay(1/(warmup_steps *(learning_rate ** 2)), - warmup_steps) - elif scheduler == 'linear_warmup_decay': - scheduled_lr = linear_warmup_decay(learning_rate, warmup_steps, - num_train_steps) - else: - raise ValueError("Unkown learning rate scheduler, should be " - "'noam_decay' or 'linear_warmup_decay'") - log.debug('using Adam') - optimizer = F.optimizer.Adam(learning_rate=scheduled_lr) - else: - scheduled_lr = L.create_global_var( - name=F.unique_name.generate("learning_rate"), - shape=[1], - value=learning_rate, - dtype='float32', - persistable=True) - log.debug('using Adam') - - optimizer = F.optimizer.Adam(learning_rate=scheduled_lr) - optimizer._learning_rate_map[F.default_main_program( - )] = scheduled_lr - - if use_fp16: - log.info('AMP activated') - optimizer = F.contrib.mixed_precision.decorate(optimizer, - amp_lists=F.contrib.mixed_precision.AutoMixedPrecisionLists(custom_black_varnames={"loss"}, custom_black_list={'layer_norm', 'arg_max', 'argmax'}), - init_loss_scaling=init_loss_scaling, - use_dynamic_loss_scaling=True, - ) - loss_scaling = optimizer.get_loss_scaling() - else: - loss_scaling = None - - F.clip.set_gradient_clip( - clip=F.clip.GradientClipByGlobalNorm(clip_norm=1.0)) - - param_list = {} - - for param in train_program.global_block().all_parameters(): - param_list[param.name] = param * 1.0 - param_list[param.name].stop_gradient = True - - _, param_grads = optimizer.minimize(loss) - - if weight_decay > 0: - for param, grad in param_grads: - if exclude_from_weight_decay(param): - continue - with param.block.program._optimized_guard( - [param, grad]), F.framework.name_scope("weight_decay"): - updated_param = param - param_list[ - param.name] * weight_decay * scheduled_lr - 
L.assign(output=param, input=updated_param) - - return scheduled_lr, loss_scaling - - -class AdamW(F.optimizer.AdamOptimizer): - """AdamW object for dygraph""" - def __init__(self, *args, **kwargs): - weight_decay = kwargs.pop('weight_decay', None) - var_name_to_exclude = kwargs.pop('var_name_to_exclude', '.*layer_norm_scale|.*layer_norm_bias|.*b_0') - super(AdamW, self).__init__(*args, **kwargs) - self.wd = weight_decay - self.pat = re.compile(var_name_to_exclude) - - def apply_optimize(self, loss, startup_program, params_grads): - super(AdamW, self).apply_optimize(loss, startup_program, params_grads) - for p, g in params_grads: - #log.debug(L.reduce_mean(p)) - if not self.pat.match(p.name): - L.assign(p * (1. - self.wd * self.current_step_lr()), p) - #log.debug(L.reduce_mean(p)) - - -class LinearDecay(D.learning_rate_scheduler.LearningRateDecay): - def __init__(self, - learning_rate, - warmup_steps, - decay_steps, - end_learning_rate=0, - power=1.0, - cycle=False, - begin=0, - step=1, - dtype='float32'): - super(LinearDecay, self).__init__(begin, step, dtype) - self.learning_rate = learning_rate - self.warmup_steps = warmup_steps - self.decay_steps = decay_steps - self.end_learning_rate = end_learning_rate - self.power = power - self.cycle = cycle - - def step(self): - if self.step_num < self.warmup_steps: - decayed_lr = self.learning_rate * (self.step_num / - self.warmup_steps) - decayed_lr = self.create_lr_var(decayed_lr) - else: - tmp_step_num = self.step_num - tmp_decay_steps = self.decay_steps - if self.cycle: - div_res = fluid.layers.ceil( - self.create_lr_var(tmp_step_num / float(self.decay_steps))) - if tmp_step_num == 0: - div_res = self.create_lr_var(1.0) - tmp_decay_steps = self.decay_steps * div_res - else: - tmp_step_num = self.create_lr_var( - tmp_step_num - if tmp_step_num < self.decay_steps else self.decay_steps) - decayed_lr = (self.learning_rate - self.end_learning_rate) * \ - ((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate - - return decayed_lr - diff --git a/ernie/tokenizing_ernie.py b/ernie/tokenizing_ernie.py index 2733767..ffcefd2 100644 --- a/ernie/tokenizing_ernie.py +++ b/ernie/tokenizing_ernie.py @@ -23,6 +23,7 @@ import six import re import logging import tempfile +from pathlib import Path from functools import partial if six.PY2: from pathlib2 import Path @@ -41,6 +42,7 @@ log = logging.getLogger(__name__) _max_input_chars_per_word = 100 + def _wordpiece(token, vocab, unk_token, prefix='##', sentencepiece_prefix=''): """ wordpiece: helloworld => [hello, ##world] """ chars = list(token) @@ -81,43 +83,56 @@ class ErnieTokenizer(object): resource_map = { 'ernie-1.0': bce + 'model-ernie1.0.1.tar.gz', 'ernie-2.0-en': bce + 'model-ernie2.0-en.1.tar.gz', - 'ernie-2.0-large-en': bce + 'model-ernie2.0-large-en.1.tar.gz', + 'ernie-2.0-large-en': bce + 'model-ernie2.0-large-en.1.tar.gz', 'ernie-tiny': bce + 'model-ernie_tiny.1.tar.gz', 'ernie-gen-base-en': bce + 'model-ernie-gen-base-en.1.tar.gz', 'ernie-gen-large-en': bce + 'model-ernie-gen-large-en.1.tar.gz', } + @classmethod - def from_pretrained(cls, pretrain_dir_or_url, force_download=False, **kwargs): + def from_pretrained(cls, + pretrain_dir_or_url, + force_download=False, + **kwargs): if pretrain_dir_or_url in cls.resource_map: url = cls.resource_map[pretrain_dir_or_url] log.info('get pretrain dir from %s' % url) - pretrain_dir = _fetch_from_remote(url, force_download=force_download) + pretrain_dir = _fetch_from_remote( + url, force_download=force_download) else: - 
log.info('pretrain dir %s not in %s, read from local' % (pretrain_dir_or_url, repr(cls.resource_map))) - pretrain_dir = Path(pretrain_dir_or_url) + log.info('pretrain dir %s not in %s, read from local' % + (pretrain_dir_or_url, repr(cls.resource_map))) + pretrain_dir = pretrain_dir_or_url + pretrain_dir = Path(pretrain_dir) if not pretrain_dir.exists(): raise ValueError('pretrain dir not found: %s' % pretrain_dir) vocab_path = pretrain_dir / 'vocab.txt' if not vocab_path.exists(): - raise ValueError('no vocab file in pretrain dir: %s' % pretrain_dir) - vocab_dict = {j.strip().split('\t')[0]: i for i, j in enumerate(vocab_path.open(encoding='utf8').readlines())} + raise ValueError('no vocab file in pretrain dir: %s' % + pretrain_dir) + vocab_dict = { + j.strip().split('\t')[0]: i + for i, j in enumerate( + vocab_path.open(encoding='utf8').readlines()) + } t = cls(vocab_dict, **kwargs) return t - def __init__(self, - vocab, - unk_token='[UNK]', - sep_token='[SEP]', - cls_token='[CLS]', - pad_token='[PAD]', - mask_token='[MASK]', - wordpiece_prefix='##', - sentencepiece_prefix='', - lower=True, - encoding='utf8', - special_token_list=[]): + def __init__(self, + vocab, + unk_token='[UNK]', + sep_token='[SEP]', + cls_token='[CLS]', + pad_token='[PAD]', + mask_token='[MASK]', + wordpiece_prefix='##', + sentencepiece_prefix='', + lower=True, + encoding='utf8', + special_token_list=[]): if not isinstance(vocab, dict): - raise ValueError('expect `vocab` to be instance of dict, got %s' % type(vocab)) + raise ValueError('expect `vocab` to be instance of dict, got %s' % + type(vocab)) self.vocab = vocab self.lower = lower self.prefix = wordpiece_prefix @@ -128,7 +143,9 @@ class ErnieTokenizer(object): self.unk_id = unk_token and self.vocab[unk_token] self.mask_id = mask_token and self.vocab[mask_token] self.unk_token = unk_token - special_tokens = {pad_token, cls_token, sep_token, unk_token, mask_token} | set(special_token_list) + special_tokens = { + pad_token, cls_token, sep_token, unk_token, mask_token + } | set(special_token_list) pat_str = '' for t in special_tokens: if t is None: @@ -146,14 +163,19 @@ class ErnieTokenizer(object): text = text.decode(self.encoding) if six.PY2 and isinstance(text, str): text = text.decode(self.encoding) - + res = [] for match in self.pat.finditer(text): match_group = match.group(0) if match.groups()[-1]: if self.lower: match_group = match_group.lower() - words, _ = _wordpiece(match_group, vocab=self.vocab, unk_token=self.unk_token, prefix=self.prefix, sentencepiece_prefix=self.sentencepiece_prefix) + words, _ = _wordpiece( + match_group, + vocab=self.vocab, + unk_token=self.unk_token, + prefix=self.prefix, + sentencepiece_prefix=self.sentencepiece_prefix) else: words = [match_group] res += words @@ -167,10 +189,12 @@ class ErnieTokenizer(object): len2 = len(id2) half = seqlen // 2 if len1 > len2: - len1_truncated, len2_truncated = max(half, seqlen - len2), min(half, len2) + len1_truncated, len2_truncated = max(half, seqlen - len2), min( + half, len2) else: - len1_truncated, len2_truncated = min(half, seqlen - len1), max(half, seqlen - len1) - return id1[: len1_truncated], id2[: len2_truncated] + len1_truncated, len2_truncated = min(half, seqlen - len1), max( + half, seqlen - len1) + return id1[:len1_truncated], id2[:len2_truncated] def build_for_ernie(self, text_id, pair_id=[]): """build sentence type id, add [CLS] [SEP]""" @@ -185,25 +209,32 @@ class ErnieTokenizer(object): return ret_id, ret_id_type def encode(self, text, pair=None, truncate_to=None): - text_id 
= np.array(self.convert_tokens_to_ids(self.tokenize(text)), dtype=np.int64)
+        text_id = np.array(
+            self.convert_tokens_to_ids(self.tokenize(text)), dtype=np.int64)
         text_id_type = np.zeros_like(text_id, dtype=np.int64)
         if pair is not None:
-            pair_id = np.array(self.convert_tokens_to_ids(self.tokenize(pair)), dtype=np.int64)
+            pair_id = np.array(
+                self.convert_tokens_to_ids(self.tokenize(pair)),
+                dtype=np.int64)
         else:
             pair_id = []
         if truncate_to is not None:
-            text_id, pair_id = self.truncate(text_id, [] if pair_id is None else pair_id, truncate_to)
+            text_id, pair_id = self.truncate(text_id, [] if pair_id is None
+                                             else pair_id, truncate_to)
 
         ret_id, ret_id_type = self.build_for_ernie(text_id, pair_id)
         return ret_id, ret_id_type
 
-
 class ErnieTinyTokenizer(ErnieTokenizer):
     bce = 'https://ernie-github.cdn.bcebos.com/'
     resource_map = {'ernie-tiny': bce + 'model-ernie_tiny.1.tar.gz'}
+
     @classmethod
-    def from_pretrained(cls, pretrain_dir_or_url, force_download=False, **kwargs):
+    def from_pretrained(cls,
+                        pretrain_dir_or_url,
+                        force_download=False,
+                        **kwargs):
         if pretrain_dir_or_url in cls.resource_map:
             url = cls.resource_map[pretrain_dir_or_url]
             log.info('get pretrain dir from %s' % url)
@@ -217,8 +248,13 @@ class ErnieTinyTokenizer(ErnieTokenizer):
         sp_model_path = pretrain_dir / 'subword/spm_cased_simp_sampled.model'
         if not vocab_path.exists():
-            raise ValueError('no vocab file in pretrain dir: %s' % pretrain_dir)
-        vocab_dict = {j.strip().split('\t')[0]: i for i, j in enumerate(vocab_path.open(encoding='utf8').readlines())}
+            raise ValueError('no vocab file in pretrain dir: %s' %
+                             pretrain_dir)
+        vocab_dict = {
+            j.strip().split('\t')[0]: i
+            for i, j in enumerate(
+                vocab_path.open(encoding='utf8').readlines())
+        }
         t = cls(vocab_dict, sp_model_path, **kwargs)
         return t
 
@@ -247,4 +283,3 @@ class ErnieTinyTokenizer(ErnieTokenizer):
         for match in self.cut(text):
             res += self.sp_model.EncodeAsPieces(match)
         return res
-
diff --git a/experimental/seq2seq/README.md b/experimental/seq2seq/README.md
deleted file mode 120000
index a04e908..0000000
--- a/experimental/seq2seq/README.md
+++ /dev/null
@@ -1 +0,0 @@
-../../demo/seq2seq/README.md
\ No newline at end of file
diff --git a/experimental/seq2seq/README.md b/experimental/seq2seq/README.md
new file mode 100644
index 0000000..608e5c0
--- /dev/null
+++ b/experimental/seq2seq/README.md
@@ -0,0 +1,59 @@
+# ERNIE-GEN
+
+[ERNIE-GEN](https://arxiv.org/pdf/2001.11314.pdf) is a multi-flow language generation framework for both pre-training and fine-tuning.
+Only the finetuning strategy is illustrated in this section.
+
+## Finetune
+
+We use the abstractive summarization task CNN/DailyMail to illustrate the usage of ERNIE-GEN; you can download the preprocessed finetuning data from [here](https://ernie-github.cdn.bcebos.com/data-cnndm.tar.gz).
+
+To start finetuning ERNIE-GEN, run:
+
+```script
+python3 -m paddle.distributed.launch \
+    --log_dir ./log \
+    ./demo/seq2seq/finetune_seq2seq_dygraph.py \
+    --from_pretrained ernie-gen-base-en \
+    --data_dir ./data/cnndm \
+    --save_dir ./model_cnndm \
+    --label_smooth 0.1 \
+    --use_random_noice \
+    --noise_prob 0.7 \
+    --predict_output_dir ./pred \
+    --max_steps $((287113*30/64))
+```
+
+Note that you need more than 2 GPUs to run the finetuning.
+During multi-GPU finetuning, `max_steps` is used as the stopping criterion rather than `epoch`, to prevent deadlock.
+We simply calculate `max_steps` as `EPOCH * NUM_TRAIN_EXAMPLE / TOTAL_BATCH`.
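+
+For example, 30 epochs over the 287,113 CNN/DailyMail training examples at a total batch size of 64 works out to the value passed above:
+
+```shell
+echo $((287113 * 30 / 64))   # prints 134584, i.e. EPOCH * NUM_TRAIN_EXAMPLE / TOTAL_BATCH
+```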
+This demo script saves a finetuned model at `--save_dir`, runs multi-GPU prediction every `--eval_steps` steps, and saves the prediction results at `--predict_output_dir`.
+
+
+### Evaluation
+
+During finetuning, a series of prediction files is generated.
+First you need to sort and join all the files with:
+
+```shell
+sort -t$'\t' -k1n ./pred/pred.step60000.* | awk -F"\t" '{print $2}' > final_prediction
+```
+
+Then use `./eval_cnndm/cnndm_eval.sh` to calculate all the metrics
+(`pyrouge` is required to evaluate CNN/Daily Mail):
+
+```shell
+sh cnndm_eval.sh final_prediction ./data/cnndm/dev.summary
+```
+
+
+### Inference
+
+To run beam search decoding after you have a finetuned model, try:
+
+```shell
+cat one_column_source_text | python3 demo/seq2seq/decode.py \
+    --from_pretrained ./ernie_gen_large \
+    --save_dir ./model_cnndm \
+    --bsz 8
+```
diff --git a/inference/README.md b/inference/README.md
index 199fa0d..0b97e1e 100644
--- a/inference/README.md
+++ b/inference/README.md
@@ -47,4 +47,3 @@ make
 | ----- | ----- |
 | CPU (Intel(R) Xeon(R) Gold 5117 CPU @ 2.00GHz, 20 threads) | 29.8818 |
 | GPU (P4) | 8.5 |
-
diff --git a/inference/cpu/CMakeLists.txt b/inference/cpu/CMakeLists.txt
index 9856460..3b28203 100644
--- a/inference/cpu/CMakeLists.txt
+++ b/inference/cpu/CMakeLists.txt
@@ -28,4 +28,3 @@ LINK_DIRECTORIES(${MKLDNN_LIB_PATH})
 
 ADD_EXECUTABLE(inference inference.cc)
 TARGET_LINK_LIBRARIES(inference dl paddle_fluid glog gflags pthread)
-
diff --git a/inference/gpu/CMakeLists.txt b/inference/gpu/CMakeLists.txt
index 9856460..3b28203 100644
--- a/inference/gpu/CMakeLists.txt
+++ b/inference/gpu/CMakeLists.txt
@@ -28,4 +28,3 @@ LINK_DIRECTORIES(${MKLDNN_LIB_PATH})
 
 ADD_EXECUTABLE(inference inference.cc)
 TARGET_LINK_LIBRARIES(inference dl paddle_fluid glog gflags pthread)
-
diff --git a/patch b/patch
new file mode 100644
index 0000000..d3dd74a
--- /dev/null
+++ b/patch
@@ -0,0 +1,35 @@
+diff --git a/demo/finetune_classifier_distributed.py b/demo/finetune_classifier_distributed.py
+index 1341f7a..e3df999 100644
+--- a/demo/finetune_classifier_distributed.py
++++ b/demo/finetune_classifier_distributed.py
+@@ -65,7 +65,7 @@ parser.add_argument('--lr', type=float, default=5e-5, help='learning rate')
+ parser.add_argument(
+     '--save_dir', type=Path, required=True, help='model output directory')
+ parser.add_argument(
+-    '--wd', type=int, default=0.01, help='weight decay, aka L2 regularizer')
++    '--wd', type=float, default=0.01, help='weight decay, aka L2 regularizer')
+ parser.add_argument(
+     '--init_checkpoint',
+     type=str,
+@@ -110,7 +110,7 @@ def map_fn(seg_a, seg_b, label):
+     return sentence, segments, label
+ 
+ train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'),
+-                                       shuffle=False, repeat=True, use_gz=False, shard=True) \
++                                       shuffle=True, repeat=True, use_gz=False, shard=True) \
+         .map(map_fn) \
+         .padded_batch(args.bsz, (0, 0, 0))
+ 
+diff --git a/propeller/data/functional.py b/propeller/data/functional.py
+index 600a139..7c43812 100644
+--- a/propeller/data/functional.py
++++ b/propeller/data/functional.py
+@@ -94,7 +94,7 @@ def _cache_shuffle_shard_func(dataset, num_shards, index, seed, drop_last,
+     len_per_shard = len(data_list) // num_shards
+     rng = np.random.RandomState(seed)
+     cnt = 0
+-    while cnt < repeat:
++    while cnt != repeat:
+         cnt += 1
+         random.shuffle(data_list, rng.uniform)
+ 
diff --git a/propeller/data/__init__.py b/propeller/data/__init__.py
index 31701fc..65592dc 100644
--- a/propeller/data/__init__.py
+++ b/propeller/data/__init__.py
@@ -14,3 +14,5 @@
""" doc """ +from propeller.data.functional import * +from propeller.data.feature_column import * diff --git a/propeller/paddle/data/example.proto b/propeller/data/example.proto similarity index 95% rename from propeller/paddle/data/example.proto rename to propeller/data/example.proto index ba6da96..d8ac146 100644 --- a/propeller/paddle/data/example.proto +++ b/propeller/data/example.proto @@ -16,7 +16,7 @@ // model training or inference. syntax = "proto3"; -import "propeller/paddle/data/feature.proto"; +import "propeller/data/feature.proto"; package propeller; message Example { diff --git a/propeller/paddle/data/example_pb2.py b/propeller/data/example_pb2.py similarity index 76% rename from propeller/paddle/data/example_pb2.py rename to propeller/data/example_pb2.py index 7ea71c7..cfea73a 100644 --- a/propeller/paddle/data/example_pb2.py +++ b/propeller/data/example_pb2.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! -# source: propeller/paddle/data/example.proto +# source: propeller/data/example.proto import sys _b = sys.version_info[0] < 3 and (lambda x: x) or ( @@ -13,19 +13,17 @@ from google.protobuf import symbol_database as _symbol_database _sym_db = _symbol_database.Default() -from propeller.paddle.data import feature_pb2 as propeller_dot_paddle_dot_data_dot_feature__pb2 +from propeller.data import feature_pb2 as propeller_dot_data_dot_feature__pb2 DESCRIPTOR = _descriptor.FileDescriptor( - name='propeller/paddle/data/example.proto', + name='propeller/data/example.proto', package='propeller', syntax='proto3', serialized_options=None, serialized_pb=_b( - '\n#propeller/paddle/data/example.proto\x12\tpropeller\x1a#propeller/paddle/data/feature.proto\"0\n\x07\x45xample\x12%\n\x08\x66\x65\x61tures\x18\x01 \x01(\x0b\x32\x13.propeller.Features\"g\n\x0fSequenceExample\x12$\n\x07\x63ontext\x18\x01 \x01(\x0b\x32\x13.propeller.Features\x12.\n\rfeature_lists\x18\x02 \x01(\x0b\x32\x17.propeller.FeatureListsb\x06proto3' + '\n\x1cpropeller/data/example.proto\x12\tpropeller\x1a\x1cpropeller/data/feature.proto\"0\n\x07\x45xample\x12%\n\x08\x66\x65\x61tures\x18\x01 \x01(\x0b\x32\x13.propeller.Features\"g\n\x0fSequenceExample\x12$\n\x07\x63ontext\x18\x01 \x01(\x0b\x32\x13.propeller.Features\x12.\n\rfeature_lists\x18\x02 \x01(\x0b\x32\x17.propeller.FeatureListsb\x06proto3' ), - dependencies=[ - propeller_dot_paddle_dot_data_dot_feature__pb2.DESCRIPTOR, - ]) + dependencies=[propeller_dot_data_dot_feature__pb2.DESCRIPTOR, ]) _EXAMPLE = _descriptor.Descriptor( name='Example', @@ -60,8 +58,8 @@ _EXAMPLE = _descriptor.Descriptor( syntax='proto3', extension_ranges=[], oneofs=[], - serialized_start=87, - serialized_end=135, ) + serialized_start=73, + serialized_end=121, ) _SEQUENCEEXAMPLE = _descriptor.Descriptor( name='SequenceExample', @@ -113,15 +111,15 @@ _SEQUENCEEXAMPLE = _descriptor.Descriptor( syntax='proto3', extension_ranges=[], oneofs=[], - serialized_start=137, - serialized_end=240, ) + serialized_start=123, + serialized_end=226, ) _EXAMPLE.fields_by_name[ - 'features'].message_type = propeller_dot_paddle_dot_data_dot_feature__pb2._FEATURES + 'features'].message_type = propeller_dot_data_dot_feature__pb2._FEATURES _SEQUENCEEXAMPLE.fields_by_name[ - 'context'].message_type = propeller_dot_paddle_dot_data_dot_feature__pb2._FEATURES + 'context'].message_type = propeller_dot_data_dot_feature__pb2._FEATURES _SEQUENCEEXAMPLE.fields_by_name[ - 'feature_lists'].message_type = propeller_dot_paddle_dot_data_dot_feature__pb2._FEATURELISTS + 
'feature_lists'].message_type = propeller_dot_data_dot_feature__pb2._FEATURELISTS DESCRIPTOR.message_types_by_name['Example'] = _EXAMPLE DESCRIPTOR.message_types_by_name['SequenceExample'] = _SEQUENCEEXAMPLE _sym_db.RegisterFileDescriptor(DESCRIPTOR) @@ -131,7 +129,7 @@ Example = _reflection.GeneratedProtocolMessageType( (_message.Message, ), dict( DESCRIPTOR=_EXAMPLE, - __module__='propeller.paddle.data.example_pb2' + __module__='propeller.data.example_pb2' # @@protoc_insertion_point(class_scope:propeller.Example) )) _sym_db.RegisterMessage(Example) @@ -141,7 +139,7 @@ SequenceExample = _reflection.GeneratedProtocolMessageType( (_message.Message, ), dict( DESCRIPTOR=_SEQUENCEEXAMPLE, - __module__='propeller.paddle.data.example_pb2' + __module__='propeller.data.example_pb2' # @@protoc_insertion_point(class_scope:propeller.SequenceExample) )) _sym_db.RegisterMessage(SequenceExample) diff --git a/propeller/paddle/data/feature.proto b/propeller/data/feature.proto similarity index 100% rename from propeller/paddle/data/feature.proto rename to propeller/data/feature.proto diff --git a/propeller/data/feature_column.py b/propeller/data/feature_column.py new file mode 100644 index 0000000..6f1990d --- /dev/null +++ b/propeller/data/feature_column.py @@ -0,0 +1,516 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""FeatureColumns and many Column""" +from __future__ import print_function +from __future__ import absolute_import +from __future__ import unicode_literals + +import os +import sys +import struct +from six.moves import zip, map +import itertools +import gzip +from functools import partial +import six +import logging + +import numpy as np +from glob import glob + +from propeller.data.functional import _interleave_func +from propeller.data.functional import Dataset +from propeller.data import example_pb2, feature_pb2 +import multiprocessing + +log = logging.getLogger(__name__) + +__all__ = [ + 'FeatureColumns', 'TextColumn', 'TextIDColumn', 'LabelColumn', + 'RawBytesColumn', 'basic_tokenizer', 'Column' +] + + +def basic_tokenizer(sen): + """doc""" + seg = sen.split(b' ') + seg = filter(lambda i: i != b' ', seg) + return seg + + +class Column(object): + """doc""" + + def __init__(self, name): + """doc""" + pass + + def raw_to_proto(self, raw): + """doc""" + return feature_pb2.Feature() + + @property + def output_shapes(self): + """doc""" + pass + + @property + def output_types(self): + """doc""" + pass + + def proto_to_instance(self, proto): + """doc""" + raise NotImplementedError() + + def raw_to_instance(self, raw): + """doc""" + raise NotImplementedError() + + +class LabelColumn(Column): + """doc""" + + def __init__(self, name, vocab_dict=None, vocab_file=None): + """doc""" + self.name = name + self.vocab = None + if vocab_file: + self.vocab = { + j.strip(): i + for i, j in enumerate(open(vocab_file, 'rb').readlines()) + } + if vocab_dict: + self.vocab = vocab_dict + + @property + def output_shapes(self): + """doc""" + return [1] + + @property + def output_types(self): + """doc""" + return 'int64' + + def raw_to_proto(self, raw): + """doc""" + if self.vocab is None: + ids = [int(raw)] + else: + ids = [self.vocab[raw]] + fe = feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=ids)) + return fe + + def proto_to_instance(self, feature): + """doc""" + ret = np.array(feature.int64_list.value[0], dtype=np.int64) + return ret + + def raw_to_instance(self, raw): + """doc""" + if self.vocab is None: + ids = int(raw) + else: + ids = self.vocab[raw] + return ids + + +class TextColumn(Column): + """doc""" + + def __init__(self, + name, + unk_id, + vocab_file=None, + vocab_dict=None, + tokenizer=basic_tokenizer): + self.name = name + self.tokenizer = tokenizer + self.unk_id = unk_id + if not (vocab_file or vocab_dict): + raise ValueError('at least specify vocab_file or vocab_dict') + if vocab_file: + self.vocab = { + j.strip(): i + for i, j in enumerate(open(vocab_file, 'rb').readlines()) + } + if vocab_dict: + self.vocab = vocab_dict + + @property + def output_shapes(self): + """doc""" + return [-1] + + @property + def output_types(self): + """doc""" + return 'int64' + + def raw_to_proto(self, raw): + """doc""" + ids = [ + s if isinstance(s, int) else self.vocab.get(s, self.unk_id) + for s in self.tokenizer(raw) + ] + fe = feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=ids)) + return fe + + def proto_to_instance(self, feature): + """doc""" + ret = np.array(feature.int64_list.value, dtype=np.int64) + return ret + + def raw_to_instance(self, raw): + """doc""" + ids = [ + s if isinstance(s, int) else self.vocab.get(s, self.unk_id) + for s in self.tokenizer(raw) + ] + return np.array(ids, dtype=np.int64) + + +class RawBytesColumn(Column): + def __init__(self, name): + self.name = name + + @property + def output_shapes(self): + """doc""" + return [-1] + + @property + def 
output_types(self): + """doc""" + return 'bytes' + + def raw_to_proto(self, raw): + """doc""" + fe = feature_pb2.Feature(bytes_list=feature_pb2.BytesList(value=[raw])) + return fe + + def proto_to_instance(self, feature): + """doc""" + ret = feature.bytes_list.value[ + 0] # np.array(feature.int64_list.value, dtype=np.int64) + return ret + + def raw_to_instance(self, raw): + """doc""" + return raw + + +class TextIDColumn(Column): + """doc""" + + def __init__(self, name): + """doc""" + self.name = name + + @property + def output_shapes(self): + """doc""" + return [-1] + + @property + def output_types(self): + """doc""" + return 'int64' + + def raw_to_proto(self, raw): + """doc""" + ids = [int(s) for s in raw.split(b' ')] + fe = feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=ids)) + return fe + + def proto_to_instance(self, feature): + """doc""" + ret = np.array(feature.int64_list.value, dtype=np.int64) + return ret + + def raw_to_instance(self, raw): + """doc""" + ret = np.array([int(i) for i in raw.split(b' ')], dtype=np.int64) + return ret + + +def _list_files(raw_dir): + return [os.path.join(raw_dir, p) for p in os.listdir(raw_dir)] + + +class FeatureColumns(object): + """A Dataset Factory object""" + + def __init__(self, columns): + """doc""" + self._columns = columns + + def _make_gz_dataset(self, raw_dir, gz_dir): + assert raw_dir or gz_dir, 'data_dir not specified when using gz mode' + if raw_dir is not None: + assert os.path.exists(raw_dir), 'raw_dir not exists: %s' % raw_dir + raw_file = os.listdir(raw_dir) + if gz_dir is None: + gz_dir = '%s_gz' % raw_dir.rstrip('/') + + if not os.path.exists(gz_dir): + os.mkdir(gz_dir) + + if raw_dir is not None: + if len(raw_file) != 0: + log.debug('try making gz') + pool = multiprocessing.Pool() + args = [(os.path.join(raw_dir, f), os.path.join(gz_dir, f), + self._columns, b'\t') for f in raw_file] + pool.map(_make_gz, args) + pool.close() + pool.join() + else: + assert len( + os.listdir(gz_dir) + ) != 0, 'cant find gz file or raw-txt file at [%s] and [%s]' % ( + raw_dir, gz_dir) + return gz_dir + + def _read_gz_dataset(self, + gz_files, + shuffle=False, + repeat=False, + shard=False, + **kwargs): + if len(gz_files) == 0: + raise ValueError('reading gz from empty file list: %s' % gz_files) + log.info('reading gz from %s' % '\n'.join(gz_files)) + dataset = Dataset.from_list(gz_files) + + if shuffle: + dataset = dataset.shuffle(buffer_size=len(gz_files)) + fn = partial( + _interleave_func, + map_fn=lambda filename: Dataset.from_record_file(filename), + cycle_length=len(gz_files), + block_length=1) + dataset = dataset.apply(fn) + + seed = kwargs.pop('seed', 0) + if shard: + from propeller.paddle.train import distribution + if shuffle: + if distribution.status.mode == distribution.DistributionMode.NCCL: + dataset = dataset.cache_shuffle_shard( + distribution.status.num_replica, + distribution.status.replica_id, + seed=seed, + drop_last=True) + else: + dataset = dataset.cache_shuffle_shard( + num_shards=1, index=0, seed=seed, drop_last=True) + else: + if distribution.status.mode == distribution.DistributionMode.NCCL: + dataset = dataset.shard(distribution.status.num_replica, + distribution.status.replica_id) + elif shuffle: + dataset = dataset.cache_shuffle_shard( + num_shards=1, index=0, seed=seed, drop_last=True) + + if repeat: + dataset = dataset.repeat() + + def _parse_gz(record_str): # function that takes python_str as input + ex = example_pb2.Example() + ex.ParseFromString(record_str) + ret = [] + fea_dict = ex.features.feature + 
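+            # `fea_dict` maps column name -> Feature proto; each column
+            # decodes its own Feature (int64_list / bytes_list) back into a
+            # numpy instance, in the order the columns were declared.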
for c in self._columns: + ins = c.proto_to_instance(fea_dict[c.name]) + ret.append(ins) + return ret + + dataset = dataset.map(_parse_gz) + return dataset + + def _read_txt_dataset(self, + data_files, + shuffle=False, + repeat=False, + shard=False, + **kwargs): + log.info('reading raw files from %s' % '\n'.join(data_files)) + dataset = Dataset.from_list(data_files) + if shuffle: + dataset = dataset.shuffle(buffer_size=len(data_files)) + + fn = partial( + _interleave_func, + map_fn=lambda filename: Dataset.from_file(filename), + cycle_length=len(data_files), + block_length=1) + dataset = dataset.apply(fn) + + seed = kwargs.pop('seed', 0) + if shard: + from propeller.paddle.train import distribution + if shuffle: + if distribution.status.mode == distribution.DistributionMode.NCCL: + dataset = dataset.cache_shuffle_shard( + distribution.status.num_replica, + distribution.status.replica_id, + seed=seed, + repeat=-1 if repeat else 1, + drop_last=True) + else: + dataset = dataset.cache_shuffle_shard( + num_shards=1, + index=0, + seed=seed, + drop_last=True, + repeat=-1 if repeat else 1) + else: + if distribution.status.mode == distribution.DistributionMode.NCCL: + dataset = dataset.shard(distribution.status.num_replica, + distribution.status.replica_id) + if repeat: + dataset.repeat() + elif shuffle: + dataset = dataset.cache_shuffle_shard( + num_shards=1, + index=0, + seed=seed, + drop_last=True, + repeat=-1 if repeat else 1) + elif repeat: + dataset = dataset.repeat() + + def _parse_txt_file( + record_str): # function that takes python_str as input + features = record_str.strip(b'\n').split(b'\t') + ret = [ + column.raw_to_instance(feature) + for feature, column in zip(features, self._columns) + ] + return ret + + dataset = dataset.map(_parse_txt_file) + return dataset + + def _read_stdin_dataset(self, encoding='utf8', shuffle=False, **kwargs): + log.info('reading raw files stdin') + + def _gen(): + if six.PY3: + source = sys.stdin.buffer + else: + source = sys.stdin + while True: + line = source.readline() + if len(line) == 0: + break + yield line, + + dataset = Dataset.from_generator_func(_gen) + if shuffle: + dataset = dataset.shuffle(buffer_size=1000) + + def _parse_stdin(record_str): + """function that takes python_str as input""" + features = record_str.strip(b'\n').split(b'\t') + ret = [ + column.raw_to_instance(feature) + for feature, column in zip(features, self._columns) + ] + return ret + + dataset = dataset.map(_parse_stdin) + return dataset + + def _prepare_dataset(self, + dataset, + map_func_before_batch=None, + map_func_after_batch=None, + shuffle_buffer_size=None, + batch_size=1, + pad_id=0, + prefetch=None, + **kwargs): + + if map_func_before_batch is not None: + dataset = dataset.map(map_func_before_batch) + if batch_size: + dataset = dataset.padded_batch(batch_size, pad_id) + if map_func_after_batch is not None: + dataset = dataset.map(map_func_after_batch) + return dataset + + def build_dataset(self, + name, + use_gz=True, + data_dir=None, + gz_dir=None, + data_file=None, + **kwargs): + """ + build `Dataset` from `data_dir` or `data_file` + if `use_gz`, will try to convert data_files to gz format and save to `gz_dir`, if `gz_dir` not given, will create one. 
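+        remaining `kwargs` (e.g. `shuffle`, `repeat`, `shard`, `seed`) are
+        forwarded to the underlying gz/txt reader.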
+ """ + if use_gz: + gz_dir = self._make_gz_dataset(data_dir, gz_dir) + gz_files = _list_files(gz_dir) if gz_dir is not None else gz_dir + ds = self._read_gz_dataset(gz_files, **kwargs) + else: + if data_dir is not None: + data_files = _list_files(data_dir) + elif data_file is not None: + data_files = [data_file] + else: + raise ValueError('data_dir or data_files not specified') + ds = self._read_txt_dataset(data_files, **kwargs) + ds.name = name + return ds + + def build_dataset_from_stdin(self, name, **kwargs): + """doc""" + ds = self._read_stdin_dataset(**kwargs) + ds.name = name + return ds + + +def _make_gz(args): + try: + from_file, to_file, columns, sep = args + if os.path.exists(to_file): + return + with open(from_file, 'rb') as fin, gzip.open(to_file, 'wb') as fout: + log.debug('making gz %s => %s' % (from_file, to_file)) + for i, line in enumerate(fin): + line = line.strip(b'\n').split(sep) + # if i % 10000 == 0: + # log.debug('making gz %s => %s [%d]' % (from_file, to_file, i)) + if len(line) != len(columns): + log.error('columns not match at %s, got %d, expect %d' % + (from_file, len(line), len(columns))) + continue + features = {} + for l, c in zip(line, columns): + features[c.name] = c.raw_to_proto(l) + example = example_pb2.Example(features=feature_pb2.Features( + feature=features)) + serialized = example.SerializeToString() + l = len(serialized) + data = struct.pack('i%ds' % l, l, serialized) + fout.write(data) + log.debug('done making gz %s => %s' % (from_file, to_file)) + except Exception as e: + log.exception(e) + raise e diff --git a/propeller/paddle/data/feature_pb2.py b/propeller/data/feature_pb2.py similarity index 87% rename from propeller/paddle/data/feature_pb2.py rename to propeller/data/feature_pb2.py index 63ba7c2..4187fdc 100644 --- a/propeller/paddle/data/feature_pb2.py +++ b/propeller/data/feature_pb2.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # Generated by the protocol buffer compiler. DO NOT EDIT! 
-# source: propeller/paddle/data/feature.proto +# source: propeller/data/feature.proto import sys _b = sys.version_info[0] < 3 and (lambda x: x) or ( @@ -14,12 +14,12 @@ from google.protobuf import symbol_database as _symbol_database _sym_db = _symbol_database.Default() DESCRIPTOR = _descriptor.FileDescriptor( - name='propeller/paddle/data/feature.proto', + name='propeller/data/feature.proto', package='propeller', syntax='proto3', serialized_options=None, serialized_pb=_b( - '\n#propeller/paddle/data/feature.proto\x12\tpropeller\"\x1a\n\tBytesList\x12\r\n\x05value\x18\x01 \x03(\x0c\"\x1e\n\tFloatList\x12\x11\n\x05value\x18\x01 \x03(\x02\x42\x02\x10\x01\"\x1e\n\tInt64List\x12\x11\n\x05value\x18\x01 \x03(\x03\x42\x02\x10\x01\"\x95\x01\n\x07\x46\x65\x61ture\x12*\n\nbytes_list\x18\x01 \x01(\x0b\x32\x14.propeller.BytesListH\x00\x12*\n\nfloat_list\x18\x02 \x01(\x0b\x32\x14.propeller.FloatListH\x00\x12*\n\nint64_list\x18\x03 \x01(\x0b\x32\x14.propeller.Int64ListH\x00\x42\x06\n\x04kind\"\x81\x01\n\x08\x46\x65\x61tures\x12\x31\n\x07\x66\x65\x61ture\x18\x01 \x03(\x0b\x32 .propeller.Features.FeatureEntry\x1a\x42\n\x0c\x46\x65\x61tureEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12!\n\x05value\x18\x02 \x01(\x0b\x32\x12.propeller.Feature:\x02\x38\x01\"2\n\x0b\x46\x65\x61tureList\x12#\n\x07\x66\x65\x61ture\x18\x01 \x03(\x0b\x32\x12.propeller.Feature\"\x9a\x01\n\x0c\x46\x65\x61tureLists\x12>\n\x0c\x66\x65\x61ture_list\x18\x01 \x03(\x0b\x32(.propeller.FeatureLists.FeatureListEntry\x1aJ\n\x10\x46\x65\x61tureListEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.propeller.FeatureList:\x02\x38\x01\x62\x06proto3' + '\n\x1cpropeller/data/feature.proto\x12\tpropeller\"\x1a\n\tBytesList\x12\r\n\x05value\x18\x01 \x03(\x0c\"\x1e\n\tFloatList\x12\x11\n\x05value\x18\x01 \x03(\x02\x42\x02\x10\x01\"\x1e\n\tInt64List\x12\x11\n\x05value\x18\x01 \x03(\x03\x42\x02\x10\x01\"\x95\x01\n\x07\x46\x65\x61ture\x12*\n\nbytes_list\x18\x01 \x01(\x0b\x32\x14.propeller.BytesListH\x00\x12*\n\nfloat_list\x18\x02 \x01(\x0b\x32\x14.propeller.FloatListH\x00\x12*\n\nint64_list\x18\x03 \x01(\x0b\x32\x14.propeller.Int64ListH\x00\x42\x06\n\x04kind\"\x81\x01\n\x08\x46\x65\x61tures\x12\x31\n\x07\x66\x65\x61ture\x18\x01 \x03(\x0b\x32 .propeller.Features.FeatureEntry\x1a\x42\n\x0c\x46\x65\x61tureEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12!\n\x05value\x18\x02 \x01(\x0b\x32\x12.propeller.Feature:\x02\x38\x01\"2\n\x0b\x46\x65\x61tureList\x12#\n\x07\x66\x65\x61ture\x18\x01 \x03(\x0b\x32\x12.propeller.Feature\"\x9a\x01\n\x0c\x46\x65\x61tureLists\x12>\n\x0c\x66\x65\x61ture_list\x18\x01 \x03(\x0b\x32(.propeller.FeatureLists.FeatureListEntry\x1aJ\n\x10\x46\x65\x61tureListEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12%\n\x05value\x18\x02 \x01(\x0b\x32\x16.propeller.FeatureList:\x02\x38\x01\x62\x06proto3' )) _BYTESLIST = _descriptor.Descriptor( @@ -55,8 +55,8 @@ _BYTESLIST = _descriptor.Descriptor( syntax='proto3', extension_ranges=[], oneofs=[], - serialized_start=50, - serialized_end=76, ) + serialized_start=43, + serialized_end=69, ) _FLOATLIST = _descriptor.Descriptor( name='FloatList', @@ -91,8 +91,8 @@ _FLOATLIST = _descriptor.Descriptor( syntax='proto3', extension_ranges=[], oneofs=[], - serialized_start=78, - serialized_end=108, ) + serialized_start=71, + serialized_end=101, ) _INT64LIST = _descriptor.Descriptor( name='Int64List', @@ -127,8 +127,8 @@ _INT64LIST = _descriptor.Descriptor( syntax='proto3', extension_ranges=[], oneofs=[], - serialized_start=110, - serialized_end=140, ) + serialized_start=103, + serialized_end=133, ) 
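Every `serialized_start`/`serialized_end` in this generated file shifts down by exactly 7 bytes: the serialized `FileDescriptor` embeds the proto path at its head, and `propeller/data/feature.proto` is 7 characters shorter than the old `propeller/paddle/data/feature.proto`. A quick sanity check (not part of the generated file):

    >>> len('propeller/paddle/data/feature.proto') - len('propeller/data/feature.proto')
    7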
_FEATURE = _descriptor.Descriptor( name='Feature', @@ -204,8 +204,8 @@ _FEATURE = _descriptor.Descriptor( containing_type=None, fields=[]), ], - serialized_start=143, - serialized_end=292, ) + serialized_start=136, + serialized_end=285, ) _FEATURES_FEATUREENTRY = _descriptor.Descriptor( name='FeatureEntry', @@ -257,8 +257,8 @@ _FEATURES_FEATUREENTRY = _descriptor.Descriptor( syntax='proto3', extension_ranges=[], oneofs=[], - serialized_start=358, - serialized_end=424, ) + serialized_start=351, + serialized_end=417, ) _FEATURES = _descriptor.Descriptor( name='Features', @@ -293,8 +293,8 @@ _FEATURES = _descriptor.Descriptor( syntax='proto3', extension_ranges=[], oneofs=[], - serialized_start=295, - serialized_end=424, ) + serialized_start=288, + serialized_end=417, ) _FEATURELIST = _descriptor.Descriptor( name='FeatureList', @@ -329,8 +329,8 @@ _FEATURELIST = _descriptor.Descriptor( syntax='proto3', extension_ranges=[], oneofs=[], - serialized_start=426, - serialized_end=476, ) + serialized_start=419, + serialized_end=469, ) _FEATURELISTS_FEATURELISTENTRY = _descriptor.Descriptor( name='FeatureListEntry', @@ -382,8 +382,8 @@ _FEATURELISTS_FEATURELISTENTRY = _descriptor.Descriptor( syntax='proto3', extension_ranges=[], oneofs=[], - serialized_start=559, - serialized_end=633, ) + serialized_start=552, + serialized_end=626, ) _FEATURELISTS = _descriptor.Descriptor( name='FeatureLists', @@ -418,8 +418,8 @@ _FEATURELISTS = _descriptor.Descriptor( syntax='proto3', extension_ranges=[], oneofs=[], - serialized_start=479, - serialized_end=633, ) + serialized_start=472, + serialized_end=626, ) _FEATURE.fields_by_name['bytes_list'].message_type = _BYTESLIST _FEATURE.fields_by_name['float_list'].message_type = _FLOATLIST @@ -459,7 +459,7 @@ BytesList = _reflection.GeneratedProtocolMessageType( (_message.Message, ), dict( DESCRIPTOR=_BYTESLIST, - __module__='propeller.paddle.data.feature_pb2' + __module__='propeller.data.feature_pb2' # @@protoc_insertion_point(class_scope:propeller.BytesList) )) _sym_db.RegisterMessage(BytesList) @@ -469,7 +469,7 @@ FloatList = _reflection.GeneratedProtocolMessageType( (_message.Message, ), dict( DESCRIPTOR=_FLOATLIST, - __module__='propeller.paddle.data.feature_pb2' + __module__='propeller.data.feature_pb2' # @@protoc_insertion_point(class_scope:propeller.FloatList) )) _sym_db.RegisterMessage(FloatList) @@ -479,7 +479,7 @@ Int64List = _reflection.GeneratedProtocolMessageType( (_message.Message, ), dict( DESCRIPTOR=_INT64LIST, - __module__='propeller.paddle.data.feature_pb2' + __module__='propeller.data.feature_pb2' # @@protoc_insertion_point(class_scope:propeller.Int64List) )) _sym_db.RegisterMessage(Int64List) @@ -489,7 +489,7 @@ Feature = _reflection.GeneratedProtocolMessageType( (_message.Message, ), dict( DESCRIPTOR=_FEATURE, - __module__='propeller.paddle.data.feature_pb2' + __module__='propeller.data.feature_pb2' # @@protoc_insertion_point(class_scope:propeller.Feature) )) _sym_db.RegisterMessage(Feature) @@ -503,11 +503,11 @@ Features = _reflection.GeneratedProtocolMessageType( (_message.Message, ), dict( DESCRIPTOR=_FEATURES_FEATUREENTRY, - __module__='propeller.paddle.data.feature_pb2' + __module__='propeller.data.feature_pb2' # @@protoc_insertion_point(class_scope:propeller.Features.FeatureEntry) )), DESCRIPTOR=_FEATURES, - __module__='propeller.paddle.data.feature_pb2' + __module__='propeller.data.feature_pb2' # @@protoc_insertion_point(class_scope:propeller.Features) )) _sym_db.RegisterMessage(Features) @@ -518,7 +518,7 @@ FeatureList = 
_reflection.GeneratedProtocolMessageType( (_message.Message, ), dict( DESCRIPTOR=_FEATURELIST, - __module__='propeller.paddle.data.feature_pb2' + __module__='propeller.data.feature_pb2' # @@protoc_insertion_point(class_scope:propeller.FeatureList) )) _sym_db.RegisterMessage(FeatureList) @@ -532,11 +532,11 @@ FeatureLists = _reflection.GeneratedProtocolMessageType( (_message.Message, ), dict( DESCRIPTOR=_FEATURELISTS_FEATURELISTENTRY, - __module__='propeller.paddle.data.feature_pb2' + __module__='propeller.data.feature_pb2' # @@protoc_insertion_point(class_scope:propeller.FeatureLists.FeatureListEntry) )), DESCRIPTOR=_FEATURELISTS, - __module__='propeller.paddle.data.feature_pb2' + __module__='propeller.data.feature_pb2' # @@protoc_insertion_point(class_scope:propeller.FeatureLists) )) _sym_db.RegisterMessage(FeatureLists) diff --git a/propeller/data/functional.py b/propeller/data/functional.py index 755500c..7c43812 100644 --- a/propeller/data/functional.py +++ b/propeller/data/functional.py @@ -86,6 +86,35 @@ def _shuffle_func(dataset, buffer_size): return _gen +def _cache_shuffle_shard_func(dataset, num_shards, index, seed, drop_last, + repeat): + def _gen(): + iterable = dataset() + data_list = list(iterable) + len_per_shard = len(data_list) // num_shards + rng = np.random.RandomState(seed) + cnt = 0 + while cnt != repeat: + cnt += 1 + random.shuffle(data_list, rng.uniform) + + iter_data_list = [ + data_list[i] for i in range(index, len(data_list), num_shards) + ] + + if drop_last: + iter_data_list = iter_data_list[:len_per_shard] + else: + fill_start_idx = len(data_list) % num_shards + if 0 < fill_start_idx <= index: + iter_data_list.append(random.choice(data_list)) + + for data in iter_data_list: + yield data + + return _gen + + def _interleave_func(iterable, map_fn, cycle_length, block_length): def _gen(): ls = itertools.tee(iterable(), cycle_length) @@ -93,7 +122,9 @@ def _interleave_func(iterable, map_fn, cycle_length, block_length): for i, j in enumerate(ls): j = itertools.islice(j, i, None, cycle_length) j = map(map_fn, j) + j = (jjj for jj in j for jjj in jj) #flatten + buf.append(j) for tup in six.moves.zip_longest(*buf): @@ -105,11 +136,14 @@ def _interleave_func(iterable, map_fn, cycle_length, block_length): def _repeat_func(dataset, n): def _gen(): - iterable = dataset() + # iterable = dataset() if n >= 0: - ret = itertools.chain(*itertools.tee(iterable, n)) + iters = [] + for i in range(n): + iters.append(dataset()) + ret = itertools.chain(*iters) else: - ret = itertools.cycle(iterable) + ret = itertools.cycle(dataset()) for i in ret: yield i @@ -151,6 +185,20 @@ def _shard_func(dataset, num_shards, index): return _gen +def _chunk_func(dataset, num_shards): + def _gen(): + iterable = dataset() + while True: + ret = list(itertools.islice(iterable, num_shards)) + if len(ret) == num_shards: + for r in ret: + yield r + else: + raise StopIteration + + return _gen + + def _take_func(dataset, count): def _gen(): iterable = dataset() @@ -229,7 +277,11 @@ def _batch_func(dataset, batch_size): return _gen -def _padded_batch_func(dataset, batch_size, pad_value=0, max_seqlen=None): +def _padded_batch_func(dataset, + batch_size, + pad_value=0, + max_seqlen=None, + droplast=False): if not isinstance(batch_size, int): raise ValueError('unknown batch_size: %s' % repr(batch_size)) @@ -238,6 +290,8 @@ def _padded_batch_func(dataset, batch_size, pad_value=0, max_seqlen=None): pad_value_t = pad_value while True: buf = list(itertools.islice(iterable, batch_size)) + if droplast and len(buf) != 
batch_size: + raise StopIteration if not len(buf): raise StopIteration buf = list(zip(*buf)) # transpose @@ -268,14 +322,50 @@ def _padded_batch_func(dataset, batch_size, pad_value=0, max_seqlen=None): return _gen +def flatten(structure): + flt = [] + + def map_structure(s): + if isinstance(s, np.ndarray): + flt.append(s) + return len(flt) - 1 + elif isinstance(s, list): + return [map_structure(item) for item in s] + elif isinstance(s, tuple): + return tuple([map_structure(item) for item in s]) + elif isinstance(s, dict): + return {key: map_structure(s[key]) for key in sorted(s.keys())} + else: + raise TypeError + + return flt, map_structure(structure) + + +def unflatten(flt, schema): + def map_structure(s): + if isinstance(s, int): + return flt[s] + elif isinstance(s, list): + return [map_structure(item) for item in s] + elif isinstance(s, tuple): + return tuple([map_structure(item) for item in s]) + elif isinstance(s, dict): + return {key: map_structure(s[key]) for key in sorted(s.keys())} + else: + raise TypeError + + return map_structure(schema) + + class Dataset(object): """Python Wrapper for PyReader""" @classmethod def from_generator_func(cls, _gen, data_shapes=None, data_types=None): """doc""" - if not inspect.isgeneratorfunction(_gen): - raise ValueError('expect generator function, got %s' % repr(_gen)) + + #if not inspect.isgeneratorfunction(_gen): + #raise ValueError('expect generator function, got %s' % repr(_gen)) def _wrapper(): #compat to py3.7 try: @@ -340,6 +430,7 @@ class Dataset(object): self.name = None self._data_shapes = None self._data_types = None + self._data_schema = None self.generator = None self.pyreader = None @@ -358,22 +449,37 @@ class Dataset(object): #def __call__(self): # return self.generator() - def _infer_shapes_and_types(self): + def _infer_shapes_and_types_and_schema(self): if self.generator is not None and self.name is not None: log.info('Try to infer data shapes & types from generator') - first_value = next(self.generator()) + first_gen = self.generator() + first_value = next(first_gen) + first_value, self._data_schema = flatten(first_value) shapes, types = [], [] for v in first_value: if not isinstance(v, np.ndarray): raise ValueError( 'dataset generator should use numpy elements, got %s' % first_value) - shapes.append(v.shape) + # use black magic to keep the same dataset shape. 
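+            # dims of size <= 1 stay 1 and anything larger collapses to a
+            # dummy 2, so the inferred placeholder shape does not depend on
+            # the first sample's actual sequence lengths.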
+ shapes.append([(i > 1) + 1 for i in v.shape]) types.append(v.dtype.name) self._data_shapes = shapes self._data_types = types log.info('Dataset `%s` has data_shapes: %s data_types: %s' % (self.name, repr(shapes), repr(types))) + original_generator = self.generator + self.is_first_call = True + + def _gen(): + if self.is_first_call: + self.is_first_call = False + generator = itertools.chain([first_value], first_gen) + else: + generator = original_generator() + yield from generator + + self.generator = _gen else: raise ValueError( 'Try to infer data shapes or types from incomplete Dataset') @@ -382,7 +488,7 @@ class Dataset(object): def data_shapes(self): """doc""" if self._data_shapes is None: - self._infer_shapes_and_types() + self._infer_shapes_and_types_and_schema() return self._data_shapes else: return self._data_shapes @@ -392,11 +498,28 @@ class Dataset(object): """doc""" self._data_shapes = val + @property + def data_schema(self): + """doc""" + if self._data_schema is None: + if self._data_shapes is not None and self._data_types is not None: + self._data_schema = [i for i in range(len(self._data_shapes))] + else: + self._infer_shapes_and_types_and_schema() + return self._data_schema + else: + return self._data_schema + + @data_schema.setter + def data_schema(self, val): + """doc""" + self._data_schema = val + @property def data_types(self): """doc""" if self._data_types is None: - self._infer_shapes_and_types() + self._infer_shapes_and_types_and_schema() return self._data_types else: return self._data_types @@ -448,6 +571,10 @@ class Dataset(object): _shard_func, num_shards=num_shards, index=index) return self.apply(func) + def chunk(self, num_shards): + func = functools.partial(_chunk_func, num_shards=num_shards) + return self.apply(func) + def interleave(self, map_fn, cycle_length, block_length): """doc""" func = functools.partial( @@ -461,13 +588,18 @@ class Dataset(object): func = functools.partial(_batch_func, batch_size=batch_size) return self.apply(func) - def padded_batch(self, batch_size, pad_value=0, max_seqlen=None): + def padded_batch(self, + batch_size, + pad_value=0, + max_seqlen=None, + droplast=False): """doc""" func = functools.partial( _padded_batch_func, batch_size=batch_size, pad_value=pad_value, - max_seqlen=max_seqlen) + max_seqlen=max_seqlen, + droplast=droplast) return self.apply(func) def take(self, count=1): @@ -483,3 +615,19 @@ class Dataset(object): def chain(self, other): func = functools.partial(_chain_func, dataset2=other.generator) return self.apply(func) + + def cache_shuffle_shard(self, + num_shards, + index, + seed=0, + drop_last=True, + repeat=-1): + func = functools.partial( + _cache_shuffle_shard_func, + num_shards=num_shards, + index=index, + seed=seed, + repeat=repeat, + drop_last=drop_last, ) + + return self.apply(func) diff --git a/propeller/paddle/__init__.py b/propeller/paddle/__init__.py index df4040a..fb0cd52 100644 --- a/propeller/paddle/__init__.py +++ b/propeller/paddle/__init__.py @@ -30,7 +30,6 @@ def enable_textone(): except ImportError: log.fatal('enable textone failed: textone not found!') raise - global textone_enabled log.info('textone enabled') from propeller.paddle.train.monitored_executor import MonitoredExecutor, TextoneTrainer if TextoneTrainer is None: @@ -38,6 +37,14 @@ def enable_textone(): MonitoredExecutor.saver_class = TextoneTrainer +def enable_oldstypled_ckpt(): + log.warn('enabling old_styled_ckpt') + from propeller.paddle.train.monitored_executor import MonitoredExecutor, Saver + MonitoredExecutor.saver_class = 
Saver + + +enable_oldstypled_ckpt() + from propeller.types import * from propeller.util import ArgumentParser, parse_hparam, parse_runconfig, parse_file @@ -46,7 +53,6 @@ from propeller.paddle import train from propeller.paddle.train import * import paddle -paddle_version = [int(i) for i in paddle.__version__.split('.')] -if paddle_version[1] < 7: - raise RuntimeError('propeller 0.2 requires paddle 1.7+, got %s' % +if paddle.__version__ != '0.0.0' and paddle.__version__ < '2.0.0': + raise RuntimeError('propeller 0.2 requires paddle 2.0+, got %s' % paddle.__version__) diff --git a/propeller/paddle/data/__init__.py b/propeller/paddle/data/__init__.py index 4b5a78f..2d98ce7 100644 --- a/propeller/paddle/data/__init__.py +++ b/propeller/paddle/data/__init__.py @@ -19,4 +19,5 @@ from __future__ import absolute_import from __future__ import unicode_literals from propeller.paddle.data.functional import * -from propeller.paddle.data.feature_column import * +from propeller.data.feature_column import * # columns +from propeller.paddle.data.feature_column import * #feature_column diff --git a/propeller/paddle/data/feature_column.py b/propeller/paddle/data/feature_column.py index b4bbdc0..46a6623 100644 --- a/propeller/paddle/data/feature_column.py +++ b/propeller/paddle/data/feature_column.py @@ -28,459 +28,31 @@ import logging import numpy as np from glob import glob -from propeller.paddle.train import distribution -from propeller.data.functional import _interleave_func +from propeller.data.feature_column import FeatureColumns as FCBase from propeller.paddle.data.functional import Dataset -from propeller.paddle.data import example_pb2, feature_pb2 import multiprocessing log = logging.getLogger(__name__) -__all__ = [ - 'FeatureColumns', 'TextColumn', 'TextIDColumn', 'LabelColumn', - 'RawBytesColumn', 'basic_tokenizer', 'Column' -] +__all__ = ['FeatureColumns'] -def basic_tokenizer(sen): - """doc""" - seg = sen.split(b' ') - seg = filter(lambda i: i != b' ', seg) - return seg - - -class Column(object): - """doc""" - - def __init__(self, name): - """doc""" - pass - - def raw_to_proto(self, raw): - """doc""" - return feature_pb2.Feature() - - @property - def output_shapes(self): - """doc""" - pass - - @property - def output_types(self): - """doc""" - pass - - def proto_to_instance(self, proto): - """doc""" - raise NotImplementedError() - - def raw_to_instance(self, raw): - """doc""" - raise NotImplementedError() - - -class LabelColumn(Column): - """doc""" - - def __init__(self, name, vocab_dict=None, vocab_file=None): - """doc""" - self.name = name - self.vocab = None - if vocab_file: - self.vocab = { - j.strip(): i - for i, j in enumerate(open(vocab_file, 'rb').readlines()) - } - if vocab_dict: - self.vocab = vocab_dict - - @property - def output_shapes(self): - """doc""" - return [1] - - @property - def output_types(self): - """doc""" - return 'int64' - - def raw_to_proto(self, raw): - """doc""" - if self.vocab is None: - ids = [int(raw)] - else: - ids = [self.vocab[raw]] - fe = feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=ids)) - return fe - - def proto_to_instance(self, feature): - """doc""" - ret = np.array(feature.int64_list.value[0], dtype=np.int64) - return ret - - def raw_to_instance(self, raw): - """doc""" - if self.vocab is None: - ids = int(raw) - else: - ids = self.vocab[raw] - return np.array(ids, dtype=np.int64) - - -class TextColumn(Column): - """doc""" - - def __init__(self, - name, - unk_id, - vocab_file=None, - vocab_dict=None, - tokenizer=basic_tokenizer): - self.name = 
name - self.tokenizer = tokenizer - self.unk_id = unk_id - if not (vocab_file or vocab_dict): - raise ValueError('at least specify vocab_file or vocab_dict') - if vocab_file: - self.vocab = { - j.strip(): i - for i, j in enumerate(open(vocab_file, 'rb').readlines()) - } - if vocab_dict: - self.vocab = vocab_dict - - @property - def output_shapes(self): - """doc""" - return [-1] - - @property - def output_types(self): - """doc""" - return 'int64' - - def raw_to_proto(self, raw): - """doc""" - ids = [ - s if isinstance(s, int) else self.vocab.get(s, self.unk_id) - for s in self.tokenizer(raw) - ] - fe = feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=ids)) - return fe - - def proto_to_instance(self, feature): - """doc""" - ret = np.array(feature.int64_list.value, dtype=np.int64) - return ret - - def raw_to_instance(self, raw): - """doc""" - ids = [ - s if isinstance(s, int) else self.vocab.get(s, self.unk_id) - for s in self.tokenizer(raw) - ] - return np.array(ids, dtype=np.int64) - - -class RawBytesColumn(Column): - def __init__(self, name): - self.name = name - - @property - def output_shapes(self): - """doc""" - return [-1] - - @property - def output_types(self): - """doc""" - return 'bytes' - - def raw_to_proto(self, raw): - """doc""" - fe = feature_pb2.Feature(bytes_list=BytesList(value=[raw])) - return fe - - def proto_to_instance(self, feature): - """doc""" - ret = feature.bytes_list.value[ - 0] #np.array(feature.int64_list.value, dtype=np.int64) - return ret - - def raw_to_instance(self, raw): - """doc""" - return raw - - -class TextIDColumn(Column): - """doc""" - - def __init__(self, name): - """doc""" - self.name = name - - @property - def output_shapes(self): - """doc""" - return [-1] - - @property - def output_types(self): - """doc""" - return 'int64' - - def raw_to_proto(self, raw): - """doc""" - ids = [int(s) for s in raw.split(b' ')] - fe = feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=ids)) - return fe - - def proto_to_instance(self, feature): - """doc""" - ret = np.array(feature.int64_list.value, dtype=np.int64) - return ret - - def raw_to_instance(self, raw): - """doc""" - ret = np.array([int(i) for i in raw.split(b' ')], dtype=np.int64) - return ret - - -def _list_files(raw_dir): - return [os.path.join(raw_dir, p) for p in os.listdir(raw_dir)] - - -_columns = None - - -def _init_worker(col): - global _columns - _columns = col - - -def _worker_entrence(args): - args = (_columns, ) + args - return _make_gz(args) - - -class FeatureColumns(object): +class FeatureColumns(FCBase): """A Dataset Factory object""" - def __init__(self, columns): - """doc""" - self._columns = columns - - def _make_gz_dataset(self, raw_dir, gz_dir): - assert raw_dir or gz_dir, 'data_dir not specified when using gz mode' - if raw_dir is not None: - assert os.path.exists(raw_dir), 'raw_dir not exists: %s' % raw_dir - raw_file = os.listdir(raw_dir) - if gz_dir is None: - gz_dir = '%s_gz' % raw_dir.rstrip('/') - - if not os.path.exists(gz_dir): - os.mkdir(gz_dir) - - if raw_dir is not None: - if len(raw_file) != 0: - log.debug('try making gz') - pool = multiprocessing.Pool( - initializer=_init_worker, - initargs=(self._columns, )) - args = [(os.path.join(raw_dir, f), os.path.join(gz_dir, f), - b'\t') for f in raw_file] - pool.map(_worker_entrence, args) - pool.close() - pool.join() - else: - assert len( - os.listdir(gz_dir) - ) != 0, 'cant find gz file or raw-txt file at [%s] and [%s]' % ( - raw_dir, gz_dir) - return gz_dir - - def _read_gz_dataset(self, - gz_files, - 
shuffle=False, - repeat=True, - shard=False, - **kwargs): - if len(gz_files) == 0: - raise ValueError('reading gz from empty file list: %s' % gz_files) - log.info('reading gz from %s' % '\n'.join(gz_files)) - dataset = Dataset.from_list(gz_files) - if repeat: - dataset = dataset.repeat() - - if shard and distribution.status.mode == distribution.DistributionMode.NCCL: - log.info('Apply dataset sharding in distribution env') - train_ds = train_ds.shard(distribution.status.num_replica, - distribution.status.replica_id) - - if shuffle: - dataset = dataset.shuffle(buffer_size=len(gz_files)) - fn = partial( - _interleave_func, - map_fn=lambda filename: Dataset.from_record_file(filename), - cycle_length=len(gz_files), - block_length=1) - dataset = dataset.apply(fn) - if shuffle: - dataset = dataset.shuffle(buffer_size=1000) - - def _parse_gz(record_str): # function that takes python_str as input - ex = example_pb2.Example() - ex.ParseFromString(record_str) - ret = [] - fea_dict = ex.features.feature - for c in self._columns: - ins = c.proto_to_instance(fea_dict[c.name]) - ret.append(ins) - return ret - - dataset = dataset.map(_parse_gz) - return dataset - - def _read_txt_dataset(self, - data_files, - shuffle=False, - repeat=True, - **kwargs): - log.info('reading raw files from %s' % '\n'.join(data_files)) - dataset = Dataset.from_list(data_files) - if repeat: - dataset = dataset.repeat() - if shuffle: - dataset = dataset.shuffle(buffer_size=len(data_files)) - - fn = partial( - _interleave_func, - map_fn=lambda filename: Dataset.from_file(filename), - cycle_length=len(data_files), - block_length=1) - dataset = dataset.apply(fn) - if shuffle: - dataset = dataset.shuffle(buffer_size=1000) - - def _parse_txt_file( - record_str): # function that takes python_str as input - features = record_str.strip(b'\n').split(b'\t') - ret = [ - column.raw_to_instance(feature) - for feature, column in zip(features, self._columns) - ] - return ret - - dataset = dataset.map(_parse_txt_file) - return dataset - - def _read_stdin_dataset(self, encoding='utf8', shuffle=False, **kwargs): - log.info('reading raw files stdin') - - def _gen(): - if six.PY3: - source = sys.stdin.buffer - else: - source = sys.stdin - while True: - line = source.readline() - if len(line) == 0: - break - yield line, - - dataset = Dataset.from_generator_func(_gen) - if shuffle: - dataset = dataset.shuffle(buffer_size=1000) - - def _parse_stdin(record_str): - """function that takes python_str as input""" - features = record_str.strip(b'\n').split(b'\t') - ret = [ - column.raw_to_instance(feature) - for feature, column in zip(features, self._columns) - ] - return ret - - dataset = dataset.map(_parse_stdin) - return dataset - - def _prepare_dataset(self, - dataset, - map_func_before_batch=None, - map_func_after_batch=None, - shuffle_buffer_size=None, - batch_size=1, - pad_id=0, - prefetch=None, - **kwargs): - - if map_func_before_batch is not None: - dataset = dataset.map(map_func_before_batch) - if batch_size: - dataset = dataset.padded_batch(batch_size, pad_id) - if map_func_after_batch is not None: - dataset = dataset.map(map_func_after_batch) - return dataset - - def build_dataset(self, - name, - use_gz=True, - data_dir=None, - gz_dir=None, - data_file=None, - **kwargs): + def build_dataset(self, *args, **kwargs): """ build `Dataset` from `data_dir` or `data_file` if `use_gz`, will try to convert data_files to gz format and save to `gz_dir`, if `gz_dir` not given, will create one. 
""" - if use_gz: - gz_dir = self._make_gz_dataset(data_dir, gz_dir) - gz_files = _list_files(gz_dir) if gz_dir is not None else gz_dir - ds = self._read_gz_dataset(gz_files, **kwargs) - else: - if data_dir is not None: - data_files = _list_files(data_dir) - elif data_file is not None: - data_files = [data_file] - else: - raise ValueError('data_dir or data_files not specified') - ds = self._read_txt_dataset(data_files, **kwargs) - ds.name = name + ds = super(FeatureColumns, self).build_dataset(*args, **kwargs) + ds.__class__ = Dataset return ds - def build_dataset_from_stdin(self, name, **kwargs): + def build_dataset_from_stdin(self, *args, **kwargs): """doc""" - ds = self._read_stdin_dataset(**kwargs) - ds.name = name + ds = super(FeatureColumns, self).build_dataset_from_stdin(*args, + **kwargs) + ds.__class__ = Dataset return ds - - -def _make_gz(args): - try: - columns, from_file, to_file, sep = args - if os.path.exists(to_file): - return - with open(from_file, 'rb') as fin, gzip.open(to_file, 'wb') as fout: - log.debug('making gz %s => %s' % (from_file, to_file)) - for i, line in enumerate(fin): - line = line.strip(b'\n').split(sep) - #if i % 10000 == 0: - # log.debug('making gz %s => %s [%d]' % (from_file, to_file, i)) - if len(line) != len(columns): - log.error('columns not match at %s, got %d, expect %d' % - (from_file, len(line), len(columns))) - continue - features = {} - for l, c in zip(line, columns): - features[c.name] = c.raw_to_proto(l) - example = example_pb2.Example(features=feature_pb2.Features( - feature=features)) - serialized = example.SerializeToString() - l = len(serialized) - data = struct.pack('i%ds' % l, l, serialized) - fout.write(data) - log.debug('done making gz %s => %s' % (from_file, to_file)) - except Exception as e: - log.exception(e) - raise e diff --git a/propeller/paddle/data/functional.py b/propeller/paddle/data/functional.py index 6ac724a..c92a439 100644 --- a/propeller/paddle/data/functional.py +++ b/propeller/paddle/data/functional.py @@ -21,11 +21,13 @@ import paddle.fluid as F import paddle.fluid.layers as L from propeller.data.functional import Dataset as DatasetBase +from propeller.data.functional import flatten +from paddle.io import IterableDataset as PDataset log = logging.getLogger(__name__) -class Dataset(DatasetBase): +class Dataset(DatasetBase, PDataset): """Pyreader based Dataset""" def placeholders(self): @@ -64,6 +66,7 @@ class Dataset(DatasetBase): def _gen(): try: for idx, i in enumerate(self.generator()): + i, _ = flatten(i) yield i except Exception as e: log.exception(e) diff --git a/propeller/paddle/train/distribution.py b/propeller/paddle/train/distribution.py index d50df19..f2a53fa 100644 --- a/propeller/paddle/train/distribution.py +++ b/propeller/paddle/train/distribution.py @@ -172,3 +172,21 @@ def init_distribuition_env(program): (repr(dis_config))) if status.is_master: sleep(30) + + +def allgather(X): + if status.mode == DistributionMode.LOCAL: + return X + Xs = [] + for i in range(status.num_replica): + copy_X = X * 1 + copy_X.stop_gradient = True + #L.Print(copy_X) + copy_X = L.collective._broadcast(copy_X, i, True) + if i != status.replica_id: + copy_X.stop_gradient = True + else: + copy_X.stop_gradient = False + Xs.append(copy_X) + Xs = L.concat(Xs, 0) + return Xs diff --git a/propeller/paddle/train/hooks.py b/propeller/paddle/train/hooks.py index e3820a7..0550483 100644 --- a/propeller/paddle/train/hooks.py +++ b/propeller/paddle/train/hooks.py @@ -45,7 +45,7 @@ class RunHook(object): """doc""" pass - def 
before_train(self, program): + def before_train(self, program, state): """doc""" pass @@ -61,7 +61,7 @@ class RunHook(object): """doc""" return False - def after_train(self): + def after_train(self, program, state): """doc""" pass @@ -144,12 +144,13 @@ class TqdmNotebookProgressBarHook(RunHook): class LoggingHook(RunHook): - """log tensor in to screan and VisualDL""" + """log tensor in to screan and tensorboard""" def __init__(self, loss, per_step=10, skip_step=100, + prefix='training', summary_writer=None, summary_record=None): """doc""" @@ -158,12 +159,13 @@ class LoggingHook(RunHook): (per_step, skip_step)) self.loss = loss self.per_step = per_step + self.prefix = prefix self.skip_step = skip_step self.summary_record = summary_record self.writer = summary_writer self.last_state = None - def before_train(self, program): + def before_train(self, program, _): """doc""" if self.summary_record: if self.summary_record.scalar: @@ -205,7 +207,7 @@ class LoggingHook(RunHook): speed = -1. self.last_state = state - # log to VisualDL + # log to tensorboard if self.writer is not None: self.writer.add_scalar('loss', loss, state.gstep) for name, t in zip(self.s_name, s_np): @@ -225,6 +227,7 @@ class LoggingHook(RunHook): # log to stdout log.debug('\t'.join([ + '[%s]' % self.prefix, 'step: %d' % state.gstep, 'steps/sec: %.5f' % speed, 'loss: %.5f' % loss, @@ -232,6 +235,10 @@ class LoggingHook(RunHook): map(lambda t: '%s:%s' % t, zip(self.s_name, s_np))), ])) + def after_train(self, program, state): + if self.writer is not None: + self.writer.close() + class StopAtStepHook(RunHook): """stop training at some step""" @@ -274,7 +281,7 @@ class EvalHook(RunHook): else: self.names, self.metrics = [], [] - def before_train(self, program): + def before_train(self, program, _): """doc""" for m in self.metrics: m.reset() @@ -307,9 +314,8 @@ class EvalHook(RunHook): """doc""" return self._result - def after_train(self): + def after_train(self, program, state): """doc""" - printable = [] self._result = {} for n, m in zip(self.names, self.metrics): val = m.eval() @@ -332,3 +338,6 @@ class CheckpointSaverHook(RunHook): if state.gstep % self.per_step == 0 and \ state.step > self.skip_step: self.saver.save(state) + + def after_train(self, program, state): + self.saver.save(state) diff --git a/propeller/paddle/train/metrics.py b/propeller/paddle/train/metrics.py index db96e52..a34e1b7 100644 --- a/propeller/paddle/train/metrics.py +++ b/propeller/paddle/train/metrics.py @@ -25,6 +25,8 @@ import paddle.fluid as F import paddle.fluid.layers as L import sklearn.metrics +from propeller.paddle.train import distribution #import allgather, status, DistributionMode + log = logging.getLogger(__name__) __all__ = [ @@ -33,17 +35,33 @@ __all__ = [ ] +def _allgather_2dim(*args): + log.info('distribution.status.mode : {}'.format(distribution.status.mode)) + if distribution.status.mode == distribution.DistributionMode.LOCAL: + return args + + if distribution.status.num_replica == 1: + return args + + for a in args: + if len(a.shape) > 2: + log.warn( + 'Metrics:%s have shape %s, cannot not be allgathered, will return to single card evaluation' + % (a, a.shape)) + else: + pass + #log.debug('broadcast %s' % a) + ret = [distribution.allgather(a) if len(a.shape) <= 2 else a for a in args] + return ret + + class Metrics(object): """Metrics base class""" def __init__(self): """doc""" self.saver = [] - - @property - def tensor(self): - """doc""" - pass + self.tensor = None def update(self, *args): """doc""" @@ -59,7 +77,7 @@ class 
Mean(Metrics): def __init__(self, t): """doc""" - self.t = t + self.t = _allgather_2dim(t) self.reset() def reset(self): @@ -69,7 +87,7 @@ class Mean(Metrics): @property def tensor(self): """doc""" - return self.t, + return self.t def update(self, args): """doc""" @@ -79,6 +97,7 @@ class Mean(Metrics): def eval(self): """doc""" + log.debug(self.saver.shape) return self.saver.mean() @@ -99,13 +118,13 @@ class Acc(Mean): raise ValueError( 'expect label shape == pred shape, got: label.shape=%s, pred.shape = %s' % (repr(label), repr(pred))) - self.eq = L.equal(pred, label) + self.eq = _allgather_2dim(L.cast(L.equal(pred, label), 'int64')) self.reset() @property def tensor(self): """doc""" - return self.eq, + return self.eq class MSE(Mean): @@ -169,7 +188,7 @@ class MacroF1(Metrics): @property def tensor(self): """doc""" - return self.label, self.pred + return [self.label, self.pred] def update(self, args): """doc""" @@ -202,20 +221,16 @@ class Precision(Metrics): self.label = label self.pred = pred self.reset() + self.tensor = _allgather_2dim(self.pred, self.label) def reset(self): """doc""" self.label_saver = np.array([], dtype=np.bool) self.pred_saver = np.array([], dtype=np.bool) - @property - def tensor(self): - """doc""" - return self.label, self.pred - def update(self, args): """doc""" - label, pred = args + pred, label = args label = label.reshape([-1]).astype(np.bool) pred = pred.reshape([-1]).astype(np.bool) if label.shape != pred.shape: @@ -255,6 +270,81 @@ class F1(Precision): return 2 * precision * recall / (precision + recall + 1.e-6) +class MicroF1(Precision): + """doc""" + + def update(self, args): + """doc""" + label, pred = args + label = label.reshape([-1]) + pred = pred.reshape([-1]) + if label.shape != pred.shape: + raise ValueError('Metrics f1: input not match: label:%s pred:%s' % + (label, pred)) + self.label_saver = np.concatenate([self.label_saver, label]) + self.pred_saver = np.concatenate([self.pred_saver, pred]) + + def eval(self): + """doc""" + return sklearn.metrics.f1_score( + self.label_saver, self.pred_saver, average='micro') + + +class MacroF1(Precision): + def eval(self): + """doc""" + return sklearn.metrics.f1_score( + self.label_saver, self.pred_saver, average='macro') + + +class MCC(Precision): + """mathew corelation coefitient""" + + def eval(self): + """doc""" + return sklearn.metrics.matthews_corrcoef(self.label_saver, + self.pred_saver) + + +class PCC(Metrics): + """pearson corelation coefitient""" + + def __init__(self, label, pred): + """doc""" + if label.shape != pred.shape: + raise ValueError( + 'expect label shape == pred shape, got: label.shape=%s, pred.shape = %s' + % (repr(label), repr(pred))) + + from scipy.stats import pearsonr + self.pearsonr = pearsonr + self.label = label + self.pred = pred + self.tensor = _allgather_2dim(self.pred, self.label) + self.reset() + + def reset(self): + """doc""" + self.label_saver = np.array([], dtype=np.float) + self.pred_saver = np.array([], dtype=np.float) + + def update(self, args): + """doc""" + pred, label = args + label = label.reshape([-1]).astype(np.float) + pred = pred.reshape([-1]).astype(np.float) + if label.shape != pred.shape: + raise ValueError('input not match: label:%s pred:%s' % + (label, pred)) + self.label_saver = np.concatenate([self.label_saver, label]) + self.pred_saver = np.concatenate([self.pred_saver, pred]) + + def eval(self): + """doc""" + p, _ = self.pearsonr(self.label_saver, self.pred_saver) + return p + + class Auc(Metrics): """doc""" @@ -267,6 +357,7 @@ class Auc(Metrics): 
self.pred = pred self.label = label + self.tensor = _allgather_2dim(self.pred, self.label) self.reset() def reset(self): @@ -274,11 +365,6 @@ class Auc(Metrics): self.pred_saver = np.array([], dtype=np.float32) self.label_saver = np.array([], dtype=np.bool) - @property - def tensor(self): - """doc""" - return [self.pred, self.label] - def update(self, args): """doc""" pred, label = args @@ -289,12 +375,37 @@ class Auc(Metrics): def eval(self): """doc""" + log.debug(self.pred_saver.shape) fpr, tpr, thresholds = sklearn.metrics.roc_curve( self.label_saver.astype(np.int64), self.pred_saver) auc = sklearn.metrics.auc(fpr, tpr) return auc +class BestAcc(Auc): + """doc""" + + def eval(self): + """doc""" + thres = np.unique(self.pred_saver) + best_thre = -1 + best_acc = -1 + + num = 10000 + gap = len(thres) // num + if gap > 0: + thres = thres[::gap] + + for thre in thres: + acc = 1. * np.sum( + (self.pred_saver > thre + ) == self.label_saver.astype(np.bool)) / len(self.pred_saver) + if acc > best_acc: + best_thre = thre + best_acc = acc + return best_acc + + class RecallAtPrecision(Auc): """doc""" @@ -533,9 +644,7 @@ class PNRatio(Metrics): 'expect label shape == pred shape, got: label.shape=%s, pred.shape = %s' % (repr(label), repr(pred))) - self.qid = qid - self.label = label - self.pred = pred + self.qid, self.label, self.pred = _allgather_2dim(qid, label, pred) self.saver = {} def reset(self): @@ -581,7 +690,7 @@ class PNRatio(Metrics): p += 1 elif p1 > p2: n += 1 - pn = p / n if n > 0 else 0.0 + pn = 1. * p / n if n > 0 else 0.0 return np.float32(pn) diff --git a/propeller/paddle/train/monitored_executor.py b/propeller/paddle/train/monitored_executor.py index 96832d7..ddf0c42 100644 --- a/propeller/paddle/train/monitored_executor.py +++ b/propeller/paddle/train/monitored_executor.py @@ -24,8 +24,11 @@ import sys import json from functools import reduce import six -from time import time +#from time import time +import time import shutil +import tarfile +import tempfile import logging import numpy as np @@ -65,7 +68,7 @@ class RunState(object): def __init__(self): """doc""" - self.__dict__ = {'gstep': 0, 'step': 0, 'time': time()} + self.__dict__ = {'gstep': 0, 'step': 0, 'time': time.time()} @property def gstep(self): @@ -107,7 +110,7 @@ class RunState(object): self.__dict__, gstep=self.gstep + 1, step=self.step + 1, - time=time()) + time=time.time()) ret = RunState() ret.__dict__ = newd return ret @@ -121,8 +124,10 @@ class Saver(object): exe, program, save_prefix='model', - max_ckpt_to_keep=None): + max_ckpt_to_keep=None, + save_tarckpt=False): """doc""" + self.save_tarckpt = save_tarckpt assert isinstance( exe, F.Executor ), 'expect normal executor to save, got executor of type %s' % repr( @@ -177,6 +182,10 @@ class Saver(object): 'can not load model from %s, is this a textone checkpoint?' 
% dir) + def tarball(self, src_dir, output_name): + with tarfile.open(output_name, "w:") as tar: + tar.add(src_dir, arcname=os.path.basename(src_dir)) + def save(self, state): """doc""" save_name = '%s_%d' % (self._save_prefix, state.gstep) @@ -189,10 +198,20 @@ class Saver(object): pass log.debug('saving step %d to %s' % (state.gstep, save_dir)) self._save_program(tmp_dir) + shutil.move(tmp_dir, save_dir) meta = state.serialize() open(os.path.join(save_dir, 'meta'), 'w').write(meta) + if self.save_tarckpt: + now = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time())) + save_dir_tar = save_dir + '_' + now + '.tar' + tar_name = os.path.basename(save_dir_tar) + log.debug('taring %s to %s' % (save_dir, save_dir_tar)) + self.tarball(save_dir, save_dir_tar) + shutil.rmtree(save_dir) + save_name = tar_name + self.ckpt_list.append(save_name) if len(self.ckpt_list) > self._max_ckpt_to_keep: ckpt_to_keep = self.ckpt_list[-self._max_ckpt_to_keep:] @@ -201,7 +220,9 @@ class Saver(object): for ckpt in ckpt_to_remove: ckpt_dir = os.path.join(self._save_dir, ckpt) if os.path.exists(ckpt_dir): - shutil.rmtree(ckpt_dir) + rm = shutil.rmtree if os.path.isdir( + ckpt_dir) else os.remove + rm(ckpt_dir) log.debug('No. of ckpt exceed %d, clean up: %s' % (self._max_ckpt_to_keep, ckpt_dir)) open(self.ckpt_info_path, 'w').write('\n'.join(self.ckpt_list)) @@ -220,6 +241,17 @@ class Saver(object): else: raise ValueError('ckpt type not understood %s' % repr(ckpt)) + if os.path.isfile(path) and tarfile.is_tarfile(path): + log.info('restore from tar file : {}'.format(path)) + tf = tarfile.open(path) + dirs = [m for m in tf.getmembers() if m.isdir()] + assert (len(dirs) == 1), dirs + tmp_dir = tempfile.mkdtemp() + log.info('extracting to : {}'.format(tmp_dir)) + tf.extractall(tmp_dir) + path = os.path.join(tmp_dir, dirs[0].name) + log.info('model path : {}'.format(path)) + meta_file = os.path.join(path, 'meta') if not os.path.exists(meta_file): raise RuntimeError('meta not found in restore dir: %s' % path) @@ -236,16 +268,20 @@ class SaverV2(Saver): F.save(self._program.train_program, save_path) def _load_program(self, dir, predicate_fn=None): - try: - save_path = os.path.join(dir, 'ckpt') - F.load( - self._program.train_program, - save_path, ) - except F.core.EnforceNotMet as e: - log.exception(e) - raise RuntimeError( - 'can not load model from %s, is this a textone checkpoint?' % - dir) + save_path = os.path.join(dir, 'ckpt') + if not os.path.exists(save_path + '.pdparams'): + try: + log.warn('failed to load model, try old-styled saver') + super(SaverV2, self)._load_program( + dir, predicate_fn=predicate_fn) + except F.core.EnforceNotMet as e: + log.exception(e) + raise RuntimeError( + 'can not load model from %s, is this a textone checkpoint?' 
+ % dir) + else: + sd = F.load_program_state(save_path) + F.set_program_state(self._program.train_program, sd) TextoneTrainer = None @@ -263,7 +299,7 @@ class MonitoredExecutor(object): state=None, run_config=None, #none if not load run_hooks=[], - warm_start_setting=None): + warm_start_setting=None, ): if not isinstance(executor, F.Executor): raise ValueError('PE is no longer supported') if isinstance(executor, F.ParallelExecutor): @@ -285,6 +321,10 @@ class MonitoredExecutor(object): self._skip_steps = run_config.skip_steps if run_config.skip_steps else 100 self._save_prefix = 'model' self._max_ckpt = run_config.max_ckpt + self._save_tarckpt = False + if hasattr(run_config, + 'save_tarckpt') and run_config.save_tarckpt is True: + self._save_tarckpt = True @property def state(self): @@ -306,7 +346,8 @@ class MonitoredExecutor(object): self._model_dir, F.Executor(_get_one_place()), program=self._program, - max_ckpt_to_keep=self._max_ckpt) + max_ckpt_to_keep=self._max_ckpt, + save_tarckpt=self._save_tarckpt) if self._warm_start_setting is not None: if not os.path.exists(self._warm_start_setting.from_dir): @@ -316,7 +357,6 @@ class MonitoredExecutor(object): if isinstance(self._warm_start_setting, WarmStartSetting): log.info("warm start from %s" % self._warm_start_setting.from_dir) - log.info(self._saver) if (not type(self._saver) is Saver) and ( not type(self._saver) is SaverV2): raise ValueError( @@ -330,17 +370,8 @@ class MonitoredExecutor(object): log.info('warm start: %s' % v.name) return ret - try: - F.io.load_vars( - self._exe, - self._warm_start_setting.from_dir, - main_program=self._program.train_program, - predicate=_fn) - except F.core.EnforceNotMet as e: - log.exception(e) - raise RuntimeError( - 'can not load model from %s, is this a textone checkpoint?' - % dir) + self._saver._load_program( + self._warm_start_setting.from_dir, predicate_fn=_fn) else: raise NotImplementedError() elif isinstance(self._warm_start_setting, TextoneWarmStartSetting): @@ -366,10 +397,10 @@ class MonitoredExecutor(object): will do nothing if loss is None i.e. 
not in train mode """ if self._loss is None: - log.debug('will not freeze a program without loss') + #log.debug('will not freeze a program without loss') return if isinstance(self._program.train_program, F.compiler.CompiledProgram): - log.debug('program has already been built') + #log.debug('program has already been built') return exec_strategy = F.ExecutionStrategy() exec_strategy.num_threads = 4 #2 for fp32 4 for fp16 @@ -378,6 +409,7 @@ class MonitoredExecutor(object): build_strategy = F.BuildStrategy() build_strategy.remove_unnecessary_lock = False + build_strategy.enable_sequential_execution = True # prevent hang #build_strategy.fuse_broadcast_ops = True build_strategy.num_trainers = distribution.status.num_replica build_strategy.trainer_id = distribution.status.replica_id @@ -413,7 +445,7 @@ class MonitoredExecutor(object): self.result = None for h in self._hooks: log.debug('train loop has hook %s' % h) - h.before_train(self._program) + h.before_train(self._program, self._state) return self def run(self, fetch_list=[], *args, **kwargs): @@ -469,7 +501,8 @@ class MonitoredExecutor(object): log.info('********** Stop Loop ************') self.result = [] for h in self._hooks: - self.result.append(h.after_train()) + self.result.append( + h.after_train(self._program, self._state)) except Exception as e: log.exception('error occur after loop %s' % repr(e)) else: diff --git a/propeller/paddle/train/trainer.py b/propeller/paddle/train/trainer.py index 8388041..6c0fefa 100644 --- a/propeller/paddle/train/trainer.py +++ b/propeller/paddle/train/trainer.py @@ -29,6 +29,7 @@ from time import time import paddle.fluid as F import paddle.fluid.layers as L +from propeller.data.functional import unflatten from propeller.types import RunMode, StopException, SummaryRecord, StopException from propeller.types import ModelSpec, InferenceSpec, ProgramPair, RunConfig from propeller.paddle import summary, collection @@ -48,11 +49,12 @@ __all__ = ['train_and_eval', 'Learner'] def _get_summary_writer(path): summary_writer = None try: - from visualdl import LogWriter + #from tensorboardX import SummaryWriter + from visualdl import LogWriter as SummaryWriter if distribution.status.is_master: - summary_writer = LogWriter(os.path.join(path)) + summary_writer = SummaryWriter(os.path.join(path)) except ImportError: - log.warning('VisualDL not installed, will not log to VisualDL') + log.warning('Visual DL not installed, will not log to tensorboard') return summary_writer @@ -65,31 +67,30 @@ def _log_eval_result(name, eval_result, swriter, state): log.debug(eval_result) printable = [] for n, val in six.iteritems(eval_result): - assert val.shape == (), 'metrics eval use float' - printable.append('{}\t{}'.format(n, val)) + #assert val.shape == (), 'metrics eval use float' + printable.append('{}:{}'.format(n, val)) if swriter is not None: swriter.add_scalar(n, val, state.gstep) - log.debug('write to VisualDL %s' % swriter.logdir) + log.debug('write to tensorboard %s' % swriter.logdir) - if len(printable): - log.info('*** eval res: %10s ***' % name) - for p in printable: - log.info(p) - log.info('******************************') + if printable: + log.info('[Eval:%s]:' % name + '\t'.join(printable)) def _build_net(model_fn, features, mode, params, run_config): model_spec = model_fn( features=features, mode=mode, params=params, run_config=run_config) - if mode == RunMode.TRAIN: + if mode == RunMode.TRAIN or mode == RunMode.EVAL: if not isinstance(model_spec.loss, F.framework.Variable): raise ValueError('model_spec.metrics 
         if distribution.status.is_master:
-            summary_writer = LogWriter(os.path.join(path))
+            summary_writer = SummaryWriter(os.path.join(path))
     except ImportError:
-        log.warning('VisualDL not installed, will not log to VisualDL')
+        log.warning('VisualDL not installed, VisualDL logging is disabled')
     return summary_writer
 
 
@@ -65,31 +67,30 @@ def _log_eval_result(name, eval_result, swriter, state):
     log.debug(eval_result)
     printable = []
     for n, val in six.iteritems(eval_result):
-        assert val.shape == (), 'metrics eval use float'
-        printable.append('{}\t{}'.format(n, val))
+        #assert val.shape == (), 'metrics eval use float'
+        printable.append('{}:{}'.format(n, val))
         if swriter is not None:
             swriter.add_scalar(n, val, state.gstep)
-            log.debug('write to VisualDL %s' % swriter.logdir)
+            log.debug('write metric to VisualDL dir %s' % swriter.logdir)
 
-    if len(printable):
-        log.info('*** eval res: %10s ***' % name)
-        for p in printable:
-            log.info(p)
-        log.info('******************************')
+    if printable:
+        log.info('[Eval:%s]:' % name + '\t'.join(printable))
 
 
 def _build_net(model_fn, features, mode, params, run_config):
     model_spec = model_fn(
         features=features, mode=mode, params=params, run_config=run_config)
 
-    if mode == RunMode.TRAIN:
+    if mode == RunMode.TRAIN or mode == RunMode.EVAL:
         if not isinstance(model_spec.loss, F.framework.Variable):
             raise ValueError('model_spec.loss should be Variable, got %s' %
                              repr(model_spec.loss))
         if not (model_spec.loss.shape == () or
                 model_spec.loss.shape == (1, )):
             raise ValueError('expect scalar loss, got %s' %
                              repr(model_spec.loss.shape))
-        #model_spec.loss.persistable = True
-    elif mode == RunMode.EVAL:
+
+    if mode == RunMode.EVAL:
         if not isinstance(model_spec.metrics, dict):
             raise ValueError('model_spec.metrics should be dict, got %s' %
@@ -154,6 +155,7 @@ class Learner(object):
         with collection.Collections() as collections:
             log.info('Building Train Graph...')
             fea = train_dataset.features()
+            fea = unflatten(fea, train_dataset.data_schema)
             model_spec = _build_net(self.model_fn, fea, RunMode.TRAIN,
                                     self.params, self.run_config)
             log.info('Building Train Graph: Done')
@@ -188,10 +190,22 @@ class Learner(object):
             #share var with Train net
             log.info('Building Eval Graph')
             fea = ds.features()
+            fea = unflatten(fea, ds.data_schema)
             model_spec = _build_net(self.model_fn, fea, RunMode.EVAL,
                                     self.params, self.run_config)
             log.info('Done')
         #program = program.clone(for_test=True)
+        # program check
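+        # the eval graph shares variables with the train graph, so instead
+        # of cloning `for_test` the ops are patched in place: dropout and
+        # batch_norm are forced into inference mode, and any optimizer op
+        # found here means the training graph leaked into evaluation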
+        optimizer_ops = {'sgd', 'adam', 'adagrad'}
+        for op in program.global_block().ops:
+            if op.type == 'dropout':
+                op._set_attr('is_test', True)
+            if op.type == 'batch_norm':
+                op._set_attr('is_test', True)
+            if op.type in optimizer_ops:
+                raise RuntimeError('Found optimizer op in eval graph, op: %s' %
+                                   repr(op))
+
         log.info(
             'Eval with: \n> Run_config: %s\n> Params: %s\n> Train_model_spec: %s\n'
             % (repr(self.run_config), repr(self.params), repr(model_spec)))
@@ -206,10 +220,20 @@ class Learner(object):
             #share var with Train net
             log.info('Building Predict Graph')
             fea = ds.features()
+            fea = unflatten(fea, ds.data_schema)
             model_spec = _build_net(self.model_fn, fea, RunMode.PREDICT,
                                     self.params, self.run_config)
             log.info('Done')
 
+        optimizer_ops = {'sgd', 'adam', 'adagrad'}
+        for op in program.global_block().ops:
+            if op.type == 'dropout':
+                op._set_attr('is_test', True)
+            if op.type == 'batch_norm':
+                op._set_attr('is_test', True)
+            if op.type in optimizer_ops:
+                raise RuntimeError('Found optimizer op in predict graph, op: %s'
+                                   % repr(op))
         #program = program.clone(for_test=True)
 
         log.info(
@@ -235,6 +259,7 @@ class Learner(object):
             summary_writer=_get_summary_writer(
                 os.path.join(self.run_config.model_dir, 'train_history')),
             per_step=self.run_config.log_steps,
+            prefix=self.run_config.log_prefix or 'training',
             skip_step=self.run_config.skip_steps),
         ]
         if model_spec.train_hooks is not None:
@@ -259,7 +284,7 @@ class Learner(object):
                 hooks.CheckpointSaverHook(
                     mon_exe._saver,
                     per_step=mon_exe._save_steps,
-                    skip_step=mon_exe._skip_steps))
+                    skip_step=mon_exe._skip_steps, ))
 
         try:
             with mon_exe:
@@ -292,13 +317,17 @@ class Learner(object):
         mon_exe = MonitoredExecutor(
             eval_executor,
             program,
+            loss=model_spec.loss,
             run_config=self.run_config,
-            run_hooks=eval_run_hooks)
+            run_hooks=eval_run_hooks,
+            warm_start_setting=self.warm_start_setting)
+        distribution.init_distribuition_env(
+            program)  # initialize the distributed env before variables are restored
         mon_exe.init_or_restore_variables()
         try:
             with mon_exe:
-                for data in eval_dataset.start(places=[single_card_place]):
+                for data in eval_dataset.start():
                     mon_exe.run(feed=data)
         except (StopException, F.core.EOFException) as e:
             pass
@@ -309,7 +338,7 @@ class Learner(object):
             os.path.join(self.run_config.model_dir, 'eval_history'))
         _log_eval_result('eval', eval_result, summary_writer, mon_exe.state)
 
-        return mon_exe.result
+        return eval_result
 
     def predict(self,
                 predict_dataset,
@@ -351,8 +380,12 @@ class Learner(object):
                 program,
                 run_config=pred_run_config,
                 warm_start_setting=self.warm_start_setting, )
-        mon_exe.init_or_restore_variables(ckpt
-                                          if ckpt_path is None else ckpt_path)
+        mon_exe.init_or_restore_variables(ckpt)
+        if ckpt_path is not None:
+            if not os.path.exists(ckpt_path):
+                raise RuntimeError('ckpt path not found: %s' % ckpt_path)
+            log.info('Loading ckpt path for prediction: %s' % ckpt_path)
+            mon_exe._saver._load_program(ckpt_path)
         try:
             with mon_exe:
                 log.info('Running predict from dir: %s' % repr(mon_exe.state))
@@ -452,7 +485,7 @@ def train_and_eval(_placeholder=None,
 
         def after_run(self, _, state):
             """doc"""
-            if state.step > run_config.skip_steps and state.gstep % run_config.eval_steps == 0:
+            if state.gstep > run_config.skip_steps and state.gstep % run_config.eval_steps == 0:
                 eval_results = {}
                 for name, ds in six.iteritems(eval_dataset):
                     ehooks = [
@@ -481,15 +514,21 @@ def train_and_eval(_placeholder=None,
                     eval_results[name] = eval_res
                     _log_eval_result(name, eval_res,
                                      self.summary_writers[name], state)
-                for exporter in exporters:
-                    exporter.export(eval_executor, self.program,
-                                    self.model_spec, eval_results, state)
+
+                if distribution.status.is_master:
+                    for exporter in exporters:
+                        exporter.export(eval_executor, self.program,
+                                        self.model_spec, eval_results, state)
             else:
                 eval_results = {}
             return eval_results
 
-    if distribution.status.is_master:
-        train_hooks.append(_EvalHookOnTrainLoop())
+        def after_train(self, _, __):
+            for _, w in six.iteritems(self.summary_writers):
+                if w:
+                    w.close()
+
+    train_hooks.append(_EvalHookOnTrainLoop())
     res = est.train(train_dataset, train_hooks=train_hooks)
     return res
 
@@ -497,7 +536,14 @@ def train_and_eval(_placeholder=None,
 def _build_model_fn(model_class):
     def _model_fn(features, mode, params, run_config):
         if mode != RunMode.PREDICT:
-            fea, label = features[:-1], features[-1]
+            if isinstance(features, (list, tuple)):
+                fea, label = features[:-1], features[-1]
+            elif isinstance(features, dict):
+                label = {"labels": features["labels"]}
+                del features["labels"]
+                fea = features
+            else:
+                raise TypeError('features should be a list, tuple or dict, '
+                                'got %s' % repr(type(features)))
         else:
             fea = features
 
diff --git a/propeller/tools/ckpt_inspector.py b/propeller/tools/ckpt_inspector.py
index dc2d664..505fccc 100644
--- a/propeller/tools/ckpt_inspector.py
+++ b/propeller/tools/ckpt_inspector.py
@@ -82,6 +82,38 @@ def parse(filename):
     return arr
 
 
+def serialize(arr, filename):
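+    """Inverse of `parse`: dump `arr` to `filename` in the tensor layout
+    this tool reads back (a sketch of the format, inferred from `parse`):
+
+        uint32 version, uint64 LoD info, uint32 tensor version (all zero),
+        int32 proto length, TensorDesc proto (dtype + dims), raw data.
+    """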
+    with open(filename, 'wb') as f:
+        write = lambda fmt, data: f.write(struct.pack(fmt, data))
+        write('I', 0)
+        write('Q', 0)
+        write('I', 0)
+        proto = framework_pb2.VarType.TensorDesc()
+        if arr.dtype == np.float32:
+            proto.data_type = framework_pb2.VarType.FP32
+            dtype = 'f'
+        elif arr.dtype == np.int64:
+            proto.data_type = framework_pb2.VarType.INT64
+            dtype = 'q'
+        elif arr.dtype == np.int32:
+            proto.data_type = framework_pb2.VarType.INT32
+            dtype = 'i'
+        elif arr.dtype == np.int8:
+            proto.data_type = framework_pb2.VarType.INT8
+            dtype = 'b'
+        elif arr.dtype == np.float16:
+            proto.data_type = framework_pb2.VarType.FP16
+            dtype = 'H'
+            arr = arr.view(np.uint16)  # pack raw fp16 bits as uint16
+        else:
+            raise RuntimeError('Unknown dtype %s' % arr.dtype)
+        proto.dims.extend(arr.shape)
+        proto_data = proto.SerializeToString()
+        write('i', len(proto_data))
+        f.write(proto_data)
+        data = struct.pack('%d%s' % (arr.size, dtype), *arr.flatten().tolist())
+        f.write(data)
+
+
 def show(arr):
     print(repr(arr))
 
@@ -111,8 +143,12 @@ if __name__ == '__main__':
     parser.add_argument('file_or_dir', type=str)
     parser.add_argument('-t', "--to", type=str, default=None)
     parser.add_argument('-v', "--verbose", action='store_true')
+    parser.add_argument('--print_items', type=int, default=None)
     args = parser.parse_args()
 
+    if args.print_items is not None:
+        np.set_printoptions(edgeitems=args.print_items)
+
     files = list_dir(args.file_or_dir)
     parsed_arr = map(parse, files)
     if args.mode == 'show':
diff --git a/propeller/types.py b/propeller/types.py
index 040beab..a1103e1 100644
--- a/propeller/types.py
+++ b/propeller/types.py
@@ -94,7 +94,8 @@ TextoneWarmStartSetting = namedtuple('TextoneWarmStartSetting', ['from_dir'])
 
 RunConfig = namedtuple('RunConfig', [
     'model_dir', 'run_steps', 'max_steps', 'save_steps', 'eval_steps',
-    'eval_max_steps', 'skip_steps', 'log_steps', 'max_ckpt', 'shit'
+    'eval_max_steps', 'skip_steps', 'log_steps', 'max_ckpt', 'shit',
+    'log_prefix'
 ])
 
 RunConfig.__new__.__defaults__ = (None, ) * len(RunConfig._fields)
diff --git a/requirements.txt b/requirements.txt
index 6ad03e8..f2894e6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ sentencepiece==0.1.8
 jieba==0.39
 visualdl>=2.0.0b7
 pathlib2>=2.3.2
+tqdm>=4.32.2
diff --git a/setup.py b/setup.py
index 9251c36..1a1ef21 100644
--- a/setup.py
+++ b/setup.py
@@ -19,10 +19,9 @@ from io import open
 with open("README.md", "r", encoding='utf-8') as fh:
     long_description = fh.read()
 
-
 setuptools.setup(
-    name="paddle-ernie", # Replace with your own username
-    version="0.0.5dev1",
+    name="paddle-ernie",
+    version="0.1.0dev1",
     author="Baidu Ernie Team",
     author_email="ernieernie.team@gmail.com",
     description="A pretrained NLP model for every NLP task",
@@ -34,7 +33,7 @@ setuptools.setup(
         'requests',
         'tqdm',
         'pathlib2',
-    ],
+    ], 
     classifiers=[
         'Intended Audience :: Developers',
         'License :: OSI Approved :: Apache Software License',
@@ -43,5 +42,4 @@ setuptools.setup(
         'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
-    ],
-)
+    ], )
-- 
GitLab