diff --git a/demo/ernie-classification/README.md b/demo/ernie-classification/README.md
index 4a37d1008625a5b5ddabad86aba0b03b9732ecbb..016038173b122675f62d575fdd4e0ff5c3c1fc68 100644
--- a/demo/ernie-classification/README.md
+++ b/demo/ernie-classification/README.md
@@ -1,5 +1,108 @@
 # ERNIE Classification
-This demo shows how to quickly complete a text classification task with the Transformer-based models ERNIE or BERT using the PaddleHub Finetune API.
+This demo shows how to use the PaddleHub Finetune API to complete classification tasks with ERNIE.
+Classification tasks fall into two broad categories:
+* Single-sentence classification
+  - Chinese sentiment analysis: ChnSentiCorp
+
+
+* Sentence-pair classification
+  - Semantic similarity: LCQMC
+  - Retrieval-based question answering: NLPCC-DBQA
+
+## How to start finetuning
+
+Once PaddlePaddle and PaddleHub are installed, run the script `sh run_sentiment_cls.sh` to start finetuning ERNIE on the ChnSentiCorp dataset.
+
+The script parameters are described below:
+
+```bash
+--batch_size: batch size; tune it to your GPU memory, and lower it if you hit an out-of-memory error
+--weight_decay: weight decay rate for the L2 regularizer
+--checkpoint_dir: directory for saving models; PaddleHub automatically saves the model that performs best on the validation set
+--num_epoch: number of finetuning epochs
+--max_seq_len: maximum sequence length used by ERNIE, at most 512; lower it if you hit an out-of-memory error
+```
+
+## Code walkthrough
+
+Finetuning with the PaddleHub Finetune API takes the following four steps:
+
+### Step1: Load the pretrained model
+
+```python
+module = hub.Module(name="ernie")
+inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
+```
+The maximum sequence length `max_seq_len` is an adjustable parameter with a recommended value of 128. Adjust it to the text length of your task, but do not exceed 512.
+
+To try a BERT model instead, such as the Chinese BERT model, just change the Module's `name` parameter.
+Besides ERNIE, PaddleHub also provides the following BERT models:
+
+BERT model name                    | PaddleHub Module name
+---------------------------------- | :------:
+BERT-Base, Uncased                 | bert_uncased_L-12_H-768_A-12
+BERT-Large, Uncased                | bert_uncased_L-24_H-1024_A-16
+BERT-Base, Cased                   | bert_cased_L-12_H-768_A-12
+BERT-Large, Cased                  | bert_cased_L-24_H-1024_A-16
+BERT-Base, Multilingual Cased      | bert_multi_cased_L-12_H-768_A-12
+BERT-Base, Chinese                 | bert_chinese_L-12_H-768_A-12
+
+
+```python
+# Switch to the Chinese BERT model seamlessly by changing the name parameter
+module = hub.Module(name="bert_chinese_L-12_H-768_A-12")
+```
+
+### Step2: Prepare the dataset and read it with ClassifyReader
+```python
+reader = hub.reader.ClassifyReader(
+    dataset=hub.dataset.ChnSentiCorp(),
+    vocab_path=module.get_vocab_path(),
+    max_seq_len=128)
+```
+`hub.dataset.ChnSentiCorp()` automatically downloads the dataset and unpacks it into the .paddlehub/dataset directory under the user's home directory.
+
+`module.get_vocab_path()` returns the vocabulary file of the ERNIE/BERT model.
+
+`max_seq_len` must match the sequence length passed to the context interface in Step1.
+
+ClassifyReader's `data_generator` automatically tokenizes the data with the model's vocabulary and, as an iterator, yields the Tensor inputs ERNIE/BERT expects: `input_ids`, `position_ids`, `segment_ids`, and the sequence mask `input_mask`.
+
+
+### Step3: Build the network and create the classification finetune task
+```python
+with fluid.program_guard(program):  # NOTE: the pretrained program returned by the Module must be passed in via fluid.program_guard
+    label = fluid.layers.data(name="label", shape=[1], dtype='int64')
+
+    pooled_output = outputs["pooled_output"]
+
+    feed_list = [
+        inputs["input_ids"].name, inputs["position_ids"].name,
+        inputs["segment_ids"].name, inputs["input_mask"].name, label.name
+    ]
+
+    cls_task = hub.create_text_classification_task(
+        feature=pooled_output, label=label, num_classes=reader.get_num_labels())
+```
+**NOTE:** when building a transfer learning network on top of a pretrained model, the network must be assembled inside the `with fluid.program_guard()` scope.
+1. `outputs["pooled_output"]` returns the [CLS] vector of the ERNIE/BERT model, which can be used as the feature representation of a sentence or sentence pair.
+2. The inputs entries in `feed_list` specify the input tensors of ERNIE/BERT plus the label, consistent with what ClassifyReader returns.
+3. `create_text_classification_task` takes the input feature, the label, and the number of classes, and generates `cls_task`, a finetune task suited to text classification.
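+
+For intuition, the generated task amounts to a classifier head on top of the [CLS] feature. A minimal sketch of a roughly equivalent head, written directly with fluid layers (an illustrative assumption about the head's shape, not PaddleHub's exact internals):
+
+```python
+# Hypothetical stand-in for create_text_classification_task:
+# a fully connected softmax layer over the pooled [CLS] feature,
+# with batch-averaged cross-entropy loss and an accuracy metric.
+# Like cls_task, it must be built inside the fluid.program_guard scope.
+logits = fluid.layers.fc(
+    input=pooled_output, size=reader.get_num_labels(), act="softmax")
+loss = fluid.layers.mean(
+    fluid.layers.cross_entropy(input=logits, label=label))
+accuracy = fluid.layers.accuracy(input=logits, label=label)
+```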
+
+### Step4: Choose a finetuning strategy and start finetuning
+
+```python
+strategy = hub.BERTFinetuneStrategy(
+    weight_decay=0.01,
+    learning_rate=5e-5,
+    warmup_strategy="linear_warmup_decay",
+)
+
+config = hub.RunConfig(use_cuda=True, num_epoch=3, batch_size=32, strategy=strategy)
+
+hub.finetune_and_eval(task=cls_task, data_reader=reader, feed_list=feed_list, config=config)
+```
+For ERNIE and BERT style tasks, PaddleHub wraps a transfer learning strategy suited to them. Users can tune finetuning by configuring hyperparameters such as the learning rate and weight decay through the strategy.
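+
+The full pipeline above is collected in the self-contained script `ernie_tiny_demo.py`. Assuming a GPU is available (the config sets `use_cuda=True`), it can be launched directly:
+
+```bash
+python ernie_tiny_demo.py
+```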
"""Finetuning on classification task """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import time import argparse -import numpy as np -import paddle import paddle.fluid as fluid import paddlehub as hub @@ -30,7 +22,6 @@ import paddlehub as hub parser = argparse.ArgumentParser(__doc__) parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.") parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") -parser.add_argument("--hub_module_dir", type=str, default=None, help="PaddleHub module directory") parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.") parser.add_argument("--data_dir", type=str, default=None, help="Path to training data.") parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") @@ -46,9 +37,8 @@ if __name__ == '__main__': trainable=True, max_seq_len=args.max_seq_len) # Step2: Download dataset and use ClassifyReader to read dataset - dataset = hub.dataset.NLPCC_DBQA() reader = hub.reader.ClassifyReader( - dataset=dataset, + dataset=hub.dataset.NLPCC_DBQA(), vocab_path=module.get_vocab_path(), max_seq_len=args.max_seq_len) num_labels = len(reader.get_labels()) diff --git a/demo/ernie-classification/question_matching.py b/demo/ernie-classification/question_matching.py index b288572409d221e3b7d787ad0b5ed62c9a8df72d..af2a0f6492c4ea169fec7906e49231d26d65acc2 100644 --- a/demo/ernie-classification/question_matching.py +++ b/demo/ernie-classification/question_matching.py @@ -13,16 +13,8 @@ # limitations under the License. """Finetuning on classification task """ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import time import argparse -import numpy as np -import paddle import paddle.fluid as fluid import paddlehub as hub @@ -30,7 +22,6 @@ import paddlehub as hub parser = argparse.ArgumentParser(__doc__) parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.") parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") -parser.add_argument("--hub_module_dir", type=str, default=None, help="PaddleHub module directory") parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.") parser.add_argument("--data_dir", type=str, default=None, help="Path to training data.") parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") @@ -46,9 +37,8 @@ if __name__ == '__main__': trainable=True, max_seq_len=args.max_seq_len) # Step2: Download dataset and use ClassifyReader to read dataset - dataset = hub.dataset.LCQMC() reader = hub.reader.ClassifyReader( - dataset=dataset, + dataset=hub.dataset.LCQMC(), vocab_path=module.get_vocab_path(), max_seq_len=args.max_seq_len) num_labels = len(reader.get_labels()) diff --git a/demo/ernie-classification/run_question_matching.sh b/demo/ernie-classification/run_question_matching.sh index 2230d8e0b713fc285f083dd3fb26d08a98d744df..7f349b942f765610cdcc60f4d2877cabe6afe90a 100644 --- a/demo/ernie-classification/run_question_matching.sh +++ b/demo/ernie-classification/run_question_matching.sh @@ -1,4 +1,4 @@ -export CUDA_VISIBLE_DEVICES=0 +export CUDA_VISIBLE_DEVICES=5 CKPT_DIR="./ckpt_question_matching" python -u 
diff --git a/demo/ernie-classification/run_sentiment_cls.sh b/demo/ernie-classification/run_sentiment_cls.sh
index 34203b1ae2e4d9c6230610f2b31759abb8c57930..246a4fa278dd132bc600435ba756631c04de5abc 100644
--- a/demo/ernie-classification/run_sentiment_cls.sh
+++ b/demo/ernie-classification/run_sentiment_cls.sh
@@ -1,4 +1,4 @@
-export CUDA_VISIBLE_DEVICES=3
+export CUDA_VISIBLE_DEVICES=5
 
 CKPT_DIR="./ckpt_sentiment_cls"
 python -u sentiment_cls.py \
diff --git a/demo/ernie-classification/sentiment_cls.py b/demo/ernie-classification/sentiment_cls.py
index 3d2c5b82ed17501f7017f5862c67dc73e772d40f..ec6b1b4be08d271effaa871c9d9a93e15192deab 100644
--- a/demo/ernie-classification/sentiment_cls.py
+++ b/demo/ernie-classification/sentiment_cls.py
@@ -13,16 +13,8 @@
 # limitations under the License.
 """Finetuning on classification task """
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import time
 import argparse
-import numpy as np
-import paddle
 
 import paddle.fluid as fluid
 import paddlehub as hub
@@ -30,7 +22,6 @@
 parser = argparse.ArgumentParser(__doc__)
 parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.")
 parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
-parser.add_argument("--hub_module_dir", type=str, default=None, help="PaddleHub module directory")
 parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
 parser.add_argument("--data_dir", type=str, default=None, help="Path to training data.")
 parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
@@ -46,12 +37,10 @@ if __name__ == '__main__':
         trainable=True, max_seq_len=args.max_seq_len)
 
     # Step2: Download dataset and use ClassifyReader to read dataset
-    dataset = hub.dataset.ChnSentiCorp()
     reader = hub.reader.ClassifyReader(
-        dataset=dataset,
+        dataset=hub.dataset.ChnSentiCorp(),
         vocab_path=module.get_vocab_path(),
         max_seq_len=args.max_seq_len)
-    num_labels = len(reader.get_labels())
 
     # Step3: construct transfer learning network
     with fluid.program_guard(program):
@@ -69,7 +58,7 @@ if __name__ == '__main__':
         ]
         # Define a classfication finetune task by PaddleHub's API
         cls_task = hub.create_text_classification_task(
-            pooled_output, label, num_classes=num_labels)
+            pooled_output, label, num_classes=reader.get_num_labels())
 
     # Step4: Select finetune strategy, setup config and finetune
     strategy = hub.BERTFinetuneStrategy(
diff --git a/demo/ernie-seq-labeling/run_sequence_labeling.sh b/demo/ernie-seq-labeling/run_sequence_labeling.sh
index 89aa22f683f97aedb14d37d46470d12fe0176651..ddd24d2a3d72318647e9ba4092cffccfb3498acc 100644
--- a/demo/ernie-seq-labeling/run_sequence_labeling.sh
+++ b/demo/ernie-seq-labeling/run_sequence_labeling.sh
@@ -1,4 +1,4 @@
-export CUDA_VISIBLE_DEVICES=0
+export CUDA_VISIBLE_DEVICES=6
 
 CKPT_DIR="./ckpt_sequence_labeling"
 python -u sequence_labeling.py \
"""Finetuning on sequence labeling task.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import time import argparse -import numpy as np -import paddle import paddle.fluid as fluid import paddlehub as hub @@ -30,7 +22,6 @@ import paddlehub as hub parser = argparse.ArgumentParser(__doc__) parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.") parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") -parser.add_argument("--hub_module_dir", type=str, default=None, help="PaddleHub module directory") parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.") parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") @@ -46,9 +37,8 @@ if __name__ == '__main__': trainable=True, max_seq_len=args.max_seq_len) # Step2: Download dataset and use SequenceLabelReader to read dataset - dataset = hub.dataset.MSRA_NER() reader = hub.reader.SequenceLabelReader( - dataset=dataset, + dataset=hub.dataset.MSRA_NER(), vocab_path=module.get_vocab_path(), max_seq_len=args.max_seq_len) @@ -60,7 +50,6 @@ if __name__ == '__main__': name="label", shape=[args.max_seq_len, 1], dtype='int64') seq_len = fluid.layers.data(name="seq_len", shape=[1], dtype='int64') - # Use "pooled_output" for classification tasks on an entire sentence. # Use "sequence_output" for token-level output. sequence_output = outputs["sequence_output"] @@ -93,6 +82,7 @@ if __name__ == '__main__': batch_size=args.batch_size, checkpoint_dir=args.checkpoint_dir, strategy=strategy) + # Finetune and evaluate model by PaddleHub's API # will finish training, evaluation, testing, save model automatically hub.finetune_and_eval( diff --git a/paddlehub/reader/nlp_reader.py b/paddlehub/reader/nlp_reader.py index 80a6c3c4ac6a0495b35301f59011d2ae549cd340..b605886163927370170778b8541b215684920a0d 100644 --- a/paddlehub/reader/nlp_reader.py +++ b/paddlehub/reader/nlp_reader.py @@ -80,6 +80,9 @@ class BaseReader(object): """Gets the list of labels for this data set.""" return self.dataset.get_labels() + def get_num_labels(self): + return len(self.dataset.get_labels()) + def get_train_progress(self): """Gets progress for training phase.""" return self.current_example, self.current_epoch