From 2355a54ffc94aacf934248300e230815034a3448 Mon Sep 17 00:00:00 2001 From: Steffy-zxf Date: Tue, 12 May 2020 19:28:24 +0800 Subject: [PATCH] add predefine network usage demo --- demo/text_classification/predict.py | 9 +- .../predict_predefine_net.py | 99 +++++++++++++++++++ demo/text_classification/run_cls.sh | 52 ++++++++++ ...classifier.sh => run_cls_predefine_net.sh} | 4 +- demo/text_classification/run_predict.sh | 10 +- .../run_predict_predefine_net.sh | 11 +++ demo/text_classification/text_cls.py | 97 ++++++++++++++++++ ...lassifier.py => text_cls_predefine_net.py} | 0 8 files changed, 268 insertions(+), 14 deletions(-) create mode 100644 demo/text_classification/predict_predefine_net.py create mode 100644 demo/text_classification/run_cls.sh rename demo/text_classification/{run_classifier.sh => run_cls_predefine_net.sh} (97%) create mode 100644 demo/text_classification/run_predict_predefine_net.sh create mode 100644 demo/text_classification/text_cls.py rename demo/text_classification/{text_classifier.py => text_cls_predefine_net.py} (100%) diff --git a/demo/text_classification/predict.py b/demo/text_classification/predict.py index e53cf2b8..40afd1ac 100644 --- a/demo/text_classification/predict.py +++ b/demo/text_classification/predict.py @@ -60,7 +60,7 @@ if __name__ == '__main__': # Construct transfer learning network # Use "pooled_output" for classification tasks on an entire sentence. # Use "sequence_output" for token-level output. 
- token_feature = outputs["sequence_output"] + pooled_output = outputs["pooled_output"] # Setup feed list for data feeder # Must feed all the tensor of module need @@ -80,15 +80,10 @@ if __name__ == '__main__': strategy=hub.AdamWeightDecayStrategy()) # Define a classfication finetune task by PaddleHub's API - # network choice: bilstm, bow, cnn, dpcnn, gru, lstm (PaddleHub pre-defined network) - # If you wanna add network after ERNIE/BERT/RoBERTa/ELECTRA module, - # you must use the outputs["sequence_output"] as the token_feature of TextClassifierTask, - # rather than outputs["pooled_output"], and feature is None cls_task = hub.TextClassifierTask( data_reader=reader, - token_feature=token_feature, + feature=pooled_output, feed_list=feed_list, - network=args.network, num_classes=dataset.num_labels, config=config) diff --git a/demo/text_classification/predict_predefine_net.py b/demo/text_classification/predict_predefine_net.py new file mode 100644 index 00000000..e53cf2b8 --- /dev/null +++ b/demo/text_classification/predict_predefine_net.py @@ -0,0 +1,99 @@ +#coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Finetuning on classification task """ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import ast +import numpy as np +import os +import time +import paddle +import paddle.fluid as fluid +import paddlehub as hub + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") +parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.") +parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") +parser.add_argument("--use_gpu", type=ast.literal_eval, default=False, help="Whether use GPU for finetuning, input should be True or False") +parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, help="Whether use data parallel.") +parser.add_argument("--network", type=str, default='bilstm', help="Pre-defined network which was connected after Transformer model, such as ERNIE, BERT ,RoBERTa and ELECTRA.") +args = parser.parse_args() +# yapf: enable. + +if __name__ == '__main__': + # Load PaddleHub ERNIE Tiny pretrained model + module = hub.Module(name="ernie_tiny") + inputs, outputs, program = module.context( + trainable=True, max_seq_len=args.max_seq_len) + + # Download dataset and use accuracy as metrics + # Choose dataset: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC + dataset = hub.dataset.ChnSentiCorp() + + # ernie_tiny uses sub-word tokenization for Chinese sentences + # For any other module, sp_model_path and word_dict_path should be set to None + reader = hub.reader.ClassifyReader( + dataset=dataset, + vocab_path=module.get_vocab_path(), + max_seq_len=args.max_seq_len, + sp_model_path=module.get_spm_path(), + word_dict_path=module.get_word_dict_path()) + + # Construct transfer learning network + # Use "pooled_output" for classification tasks on an entire sentence. 
+ # Use "sequence_output" for token-level output. + token_feature = outputs["sequence_output"] + + # Setup feed list for data feeder + # Must feed all the tensors the module needs + feed_list = [ + inputs["input_ids"].name, + inputs["position_ids"].name, + inputs["segment_ids"].name, + inputs["input_mask"].name, + ] + + # Setup running config for PaddleHub Finetune API + config = hub.RunConfig( + use_data_parallel=args.use_data_parallel, + use_cuda=args.use_gpu, + batch_size=args.batch_size, + checkpoint_dir=args.checkpoint_dir, + strategy=hub.AdamWeightDecayStrategy()) + + # Define a classification finetune task by PaddleHub's API + # network choice: bilstm, bow, cnn, dpcnn, gru, lstm (PaddleHub pre-defined network) + # If you want to add a network after the ERNIE/BERT/RoBERTa/ELECTRA module, + # you must use the outputs["sequence_output"] as the token_feature of TextClassifierTask, + # rather than outputs["pooled_output"], and feature is None + cls_task = hub.TextClassifierTask( + data_reader=reader, + token_feature=token_feature, + feed_list=feed_list, + network=args.network, + num_classes=dataset.num_labels, + config=config) + + # Data to be predicted + data = [["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"], ["交通方便;环境很好;服务态度很好 房间较小"], + ["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"]] + + print(cls_task.predict(data=data, return_result=True)) diff --git a/demo/text_classification/run_cls.sh b/demo/text_classification/run_cls.sh new file mode 100644 index 00000000..67de0395 --- /dev/null +++ b/demo/text_classification/run_cls.sh @@ -0,0 +1,52 @@ +export FLAGS_eager_delete_tensor_gb=0.0 +export CUDA_VISIBLE_DEVICES=0 + +CKPT_DIR="./ckpt_chnsenticorp" + +python -u text_cls.py \ + --batch_size=24 \ + --use_gpu=True \ + --checkpoint_dir=${CKPT_DIR} \ + --learning_rate=5e-5 \ + --weight_decay=0.01 \ + --max_seq_len=128 \ + --warmup_proportion=0.1 \ + --num_epoch=3 \ + --use_data_parallel=True + +# The suggested hyperparameters for different tasks +# for ChineseGLUE: +# TNews: 
batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5 +# LCQMC: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5 +# XNLI_zh: batch_size=32, weight_decay=0, num_epoch=2, max_seq_len=128, lr=5e-5 +# INEWS: batch_size=4, weight_decay=0, num_epoch=3, max_seq_len=512, lr=5e-5 +# DRCD: see demo: reading-comprehension +# CMRC2018: see demo: reading-comprehension +# BQ: batch_size=32, weight_decay=0, num_epoch=2, max_seq_len=100, lr=1e-5 +# MSRANER: see demo: sequence-labeling +# THUCNEWS: batch_size=8, weight_decay=0, num_epoch=2, max_seq_len=512, lr=5e-5 +# IFLYTEKDATA: batch_size=16, weight_decay=0, num_epoch=5, max_seq_len=256, lr=1e-5 + +# for other tasks: +# ChnSentiCorp: batch_size=24, weight_decay=0.01, num_epoch=3, max_seq_len=128, lr=5e-5 +# NLPCC_DBQA: batch_size=8, weight_decay=0.01, num_epoch=3, max_seq_len=512, lr=2e-5 +# LCQMC: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=2e-5 +# QQP: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5 +# QNLI: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5 +# SST-2: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5 +# CoLA: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5 +# MRPC: batch_size=32, weight_decay=0.01, num_epoch=3, max_seq_len=128, lr=5e-5 +# RTE: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=3e-5 +# MNLI: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5 +# Specify the matched/mismatched dev and test dataset with an underscore. +# mnli_m or mnli: dev and test in matched dataset. +# mnli_mm: dev and test in mismatched dataset. +# The difference can be seen in https://www.nyu.edu/projects/bowman/multinli/paper.pdf. +# If you are not sure which one to pick, just use mnli or mnli_m. +# XNLI: batch_size=32, weight_decay=0, num_epoch=3, max_seq_len=128, lr=5e-5 +# Specify the language with an underscore like xnli_zh. 
+# ar- Arabic bg- Bulgarian de- German +# el- Greek en- English es- Spanish +# fr- French hi- Hindi ru- Russian +# sw- Swahili th- Thai tr- Turkish +# ur- Urdu vi- Vietnamese zh- Chinese (Simplified) diff --git a/demo/text_classification/run_classifier.sh b/demo/text_classification/run_cls_predefine_net.sh similarity index 97% rename from demo/text_classification/run_classifier.sh rename to demo/text_classification/run_cls_predefine_net.sh index afb9e712..f8dab3b9 100644 --- a/demo/text_classification/run_classifier.sh +++ b/demo/text_classification/run_cls_predefine_net.sh @@ -1,9 +1,9 @@ export FLAGS_eager_delete_tensor_gb=0.0 export CUDA_VISIBLE_DEVICES=0 -CKPT_DIR="./ckpt_chnsenticorp" +CKPT_DIR="./ckpt_chnsenticorp_predefine_net" -python -u text_classifier.py \ +python -u text_cls_predefine_net.py \ --batch_size=24 \ --use_gpu=True \ --checkpoint_dir=${CKPT_DIR} \ diff --git a/demo/text_classification/run_predict.sh b/demo/text_classification/run_predict.sh index a1a9b76a..c4c0d600 100644 --- a/demo/text_classification/run_predict.sh +++ b/demo/text_classification/run_predict.sh @@ -3,8 +3,8 @@ export CUDA_VISIBLE_DEVICES=0 CKPT_DIR="./ckpt_chnsenticorp" -python -u predict.py --checkpoint_dir=$CKPT_DIR \ - --max_seq_len=128 \ - --use_gpu=True \ - --batch_size=24 \ - --network=bilstm +python -u predict.py \ + --checkpoint_dir=$CKPT_DIR \ + --max_seq_len=128 \ + --use_gpu=True \ + --batch_size=24 diff --git a/demo/text_classification/run_predict_predefine_net.sh b/demo/text_classification/run_predict_predefine_net.sh new file mode 100644 index 00000000..a29e713f --- /dev/null +++ b/demo/text_classification/run_predict_predefine_net.sh @@ -0,0 +1,11 @@ +export FLAGS_eager_delete_tensor_gb=0.0 +export CUDA_VISIBLE_DEVICES=0 + +CKPT_DIR="./ckpt_chnsenticorp_predefine_net" + +python -u predict_predefine_net.py \ + --checkpoint_dir=$CKPT_DIR \ + --max_seq_len=128 \ + --use_gpu=True \ + --batch_size=24 \ + --network=bilstm diff --git 
a/demo/text_classification/text_cls.py b/demo/text_classification/text_cls.py new file mode 100644 index 00000000..e221cdc7 --- /dev/null +++ b/demo/text_classification/text_cls.py @@ -0,0 +1,97 @@ +#coding:utf-8 +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Finetuning on classification task """ + +import argparse +import ast +import paddlehub as hub + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.") +parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False") +parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") +parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.") +parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup proportion params for warmup strategy") +parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") +parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") +parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.") +parser.add_argument("--use_data_parallel", type=ast.literal_eval, default=False, 
help="Whether use data parallel.") +args = parser.parse_args() +# yapf: enable. + +if __name__ == '__main__': + + # Load PaddleHub ERNIE Tiny pretrained model + module = hub.Module(name="ernie_tiny") + inputs, outputs, program = module.context( + trainable=True, max_seq_len=args.max_seq_len) + + # Download dataset and use accuracy as metrics + # Choose dataset: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC + # metric should be acc, f1 or matthews + dataset = hub.dataset.ChnSentiCorp() + metrics_choices = ["acc"] + + # ernie_tiny uses sub-word tokenization for Chinese sentences + # For any other module, sp_model_path and word_dict_path should be set to None + reader = hub.reader.ClassifyReader( + dataset=dataset, + vocab_path=module.get_vocab_path(), + max_seq_len=args.max_seq_len, + sp_model_path=module.get_spm_path(), + word_dict_path=module.get_word_dict_path()) + + # Construct transfer learning network + # Use "pooled_output" for classification tasks on an entire sentence. + # Use "sequence_output" for token-level output. 
+ pooled_output = outputs["pooled_output"] + + # Setup feed list for data feeder + # Must feed all the tensors the module needs + feed_list = [ + inputs["input_ids"].name, + inputs["position_ids"].name, + inputs["segment_ids"].name, + inputs["input_mask"].name, + ] + + # Select finetune strategy, setup config and finetune + strategy = hub.AdamWeightDecayStrategy( + warmup_proportion=args.warmup_proportion, + weight_decay=args.weight_decay, + learning_rate=args.learning_rate) + + # Setup running config for PaddleHub Finetune API + config = hub.RunConfig( + use_data_parallel=args.use_data_parallel, + use_cuda=args.use_gpu, + num_epoch=args.num_epoch, + batch_size=args.batch_size, + checkpoint_dir=args.checkpoint_dir, + strategy=strategy) + + # Define a classification finetune task by PaddleHub's API + cls_task = hub.TextClassifierTask( + data_reader=reader, + feature=pooled_output, + feed_list=feed_list, + num_classes=dataset.num_labels, + config=config, + metrics_choices=metrics_choices) + + # Finetune and evaluate by PaddleHub's API + # Training, evaluation, testing and model saving are all performed automatically + cls_task.finetune_and_eval() diff --git a/demo/text_classification/text_classifier.py b/demo/text_classification/text_cls_predefine_net.py similarity index 100% rename from demo/text_classification/text_classifier.py rename to demo/text_classification/text_cls_predefine_net.py -- GitLab