From dae2ef9baef4d176ed458568b81eaf7ede2d468d Mon Sep 17 00:00:00 2001
From: Zeyu Chen
Date: Wed, 10 Apr 2019 01:22:32 +0800
Subject: [PATCH] add more ernie classification tasks and datasets

---
 .../question_answering.py                     | 97 +++++++++++++++++++
 .../ernie-classification/question_matching.py | 97 +++++++++++++++++++
 .../run_question_answering.sh                 | 10 ++
 .../run_question_matching.sh                  | 10 ++
 ...ntune_with_hub.sh => run_sentiment_cls.sh} |  4 +-
 ...{finetune_with_hub.py => sentiment_cls.py} |  3 +-
 paddlehub/dataset/__init__.py                 |  2 ++
 paddlehub/dataset/chnsenticorp.py             | 17 ++--
 paddlehub/dataset/dataset.py                  | 24 +++++
 paddlehub/dataset/lcqmc.py                    | 84 ++++++++++++++++
 paddlehub/dataset/msra_ner.py                 |  9 +-
 paddlehub/dataset/nlpcc_dbqa.py               | 84 ++++++++++++++++
 paddlehub/finetune/evaluate.py                |  2 +
 paddlehub/reader/nlp_reader.py                |  6 +-
 14 files changed, 437 insertions(+), 12 deletions(-)
 create mode 100644 demo/ernie-classification/question_answering.py
 create mode 100644 demo/ernie-classification/question_matching.py
 create mode 100644 demo/ernie-classification/run_question_answering.sh
 create mode 100644 demo/ernie-classification/run_question_matching.sh
 rename demo/ernie-classification/{run_fintune_with_hub.sh => run_sentiment_cls.sh} (81%)
 rename demo/ernie-classification/{finetune_with_hub.py => sentiment_cls.py} (98%)
 create mode 100644 paddlehub/dataset/lcqmc.py
 create mode 100644 paddlehub/dataset/nlpcc_dbqa.py

diff --git a/demo/ernie-classification/question_answering.py b/demo/ernie-classification/question_answering.py
new file mode 100644
index 00000000..b09ee782
--- /dev/null
+++ b/demo/ernie-classification/question_answering.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fine-tuning on a text classification task (NLPCC-DBQA question answering)."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import argparse
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+import paddlehub as hub
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument("--num_epoch", type=int, default=3, help="Number of epochs for fine-tuning.")
+parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
+parser.add_argument("--hub_module_dir", type=str, default=None, help="PaddleHub module directory.")
+parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.")
+parser.add_argument("--data_dir", type=str, default=None, help="Path to training data.")
+parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to save model checkpoints.")
+parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words in the longest sequence.")
+parser.add_argument("--batch_size", type=int, default=32, help="Number of examples per training batch.")
+args = parser.parse_args()
+# yapf: enable.
+
+if __name__ == '__main__':
+    # Select a finetune strategy
+    strategy = hub.BERTFinetuneStrategy(
+        weight_decay=args.weight_decay,
+        learning_rate=args.learning_rate,
+        warmup_strategy="linear_warmup_decay",
+    )
+
+    # Set up the running config for the PaddleHub Finetune API
+    config = hub.RunConfig(
+        eval_interval=100,
+        use_cuda=True,
+        num_epoch=args.num_epoch,
+        batch_size=args.batch_size,
+        checkpoint_dir=args.checkpoint_dir,
+        strategy=strategy)
+
+    # Load the PaddleHub ERNIE pretrained model
+    module = hub.Module(name="ernie")
+
+    # Sentence-pair classification dataset reader
+    reader = hub.reader.ClassifyReader(
+        dataset=hub.dataset.NLPCC_DBQA(),  # downloads the NLPCC-DBQA dataset on first use
+        vocab_path=module.get_vocab_path(),
+        max_seq_len=args.max_seq_len)
+
+    num_labels = len(reader.get_labels())
+
+    input_dict, output_dict, program = module.context(
+        sign_name="tokens", trainable=True, max_seq_len=args.max_seq_len)
+
+    with fluid.program_guard(program):
+        label = fluid.layers.data(name="label", shape=[1], dtype='int64')
+
+        # Use "pooled_output" for classification tasks on an entire sentence.
+        # Use "sequence_outputs" for token-level output.
+        pooled_output = output_dict["pooled_output"]
+
+        # Set up the feed list for the data feeder.
+        # All tensors that the ERNIE module needs must be fed.
+        feed_list = [
+            input_dict["input_ids"].name, input_dict["position_ids"].name,
+            input_dict["segment_ids"].name, input_dict["input_mask"].name,
+            label.name
+        ]
+        # Define a classification fine-tune task with PaddleHub's API
+        cls_task = hub.create_text_classification_task(
+            pooled_output, label, num_classes=num_labels)
+
+        # Fine-tune and evaluate with PaddleHub's API;
+        # training, evaluation, testing and model saving happen automatically
+        hub.finetune_and_eval(
+            task=cls_task,
+            data_reader=reader,
+            feed_list=feed_list,
+            config=config)
diff --git a/demo/ernie-classification/question_matching.py b/demo/ernie-classification/question_matching.py
new file mode 100644
index 00000000..64ee1f79
--- /dev/null
+++ b/demo/ernie-classification/question_matching.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Finetuning on classification task """ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +import argparse +import numpy as np + +import paddle +import paddle.fluid as fluid +import paddlehub as hub + +# yapf: disable +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--num_epoch", type=int, default=3, help="Number of epoches for fine-tuning.") +parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") +parser.add_argument("--hub_module_dir", type=str, default=None, help="PaddleHub module directory") +parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay rate for L2 regularizer.") +parser.add_argument("--data_dir", type=str, default=None, help="Path to training data.") +parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint") +parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.") +parser.add_argument("--batch_size", type=int, default=32, help="Total examples' number in batch for training.") +args = parser.parse_args() +# yapf: enable. + +if __name__ == '__main__': + # Select a finetune strategy + strategy = hub.BERTFinetuneStrategy( + weight_decay=args.weight_decay, + learning_rate=args.learning_rate, + warmup_strategy="linear_warmup_decay", + ) + + # Setup runing config for PaddleHub Finetune API + config = hub.RunConfig( + eval_interval=100, + use_cuda=True, + num_epoch=args.num_epoch, + batch_size=args.batch_size, + checkpoint_dir=args.checkpoint_dir, + strategy=strategy) + + # loading Paddlehub ERNIE pretrained model + module = hub.Module(name="ernie") + + # Sentence classification dataset reader + reader = hub.reader.ClassifyReader( + dataset=hub.dataset.LCQMC(), # download LCQMC dataset + vocab_path=module.get_vocab_path(), + max_seq_len=args.max_seq_len) + + num_labels = len(reader.get_labels()) + + input_dict, output_dict, program = module.context( + sign_name="tokens", trainable=True, max_seq_len=args.max_seq_len) + + with fluid.program_guard(program): + label = fluid.layers.data(name="label", shape=[1], dtype='int64') + + # Use "pooled_output" for classification tasks on an entire sentence. + # Use "sequence_outputs" for token-level output. 
+        pooled_output = output_dict["pooled_output"]
+
+        # Set up the feed list for the data feeder.
+        # All tensors that the ERNIE module needs must be fed.
+        feed_list = [
+            input_dict["input_ids"].name, input_dict["position_ids"].name,
+            input_dict["segment_ids"].name, input_dict["input_mask"].name,
+            label.name
+        ]
+        # Define a classification fine-tune task with PaddleHub's API
+        cls_task = hub.create_text_classification_task(
+            pooled_output, label, num_classes=num_labels)
+
+        # Fine-tune and evaluate with PaddleHub's API;
+        # training, evaluation, testing and model saving happen automatically
+        hub.finetune_and_eval(
+            task=cls_task,
+            data_reader=reader,
+            feed_list=feed_list,
+            config=config)
diff --git a/demo/ernie-classification/run_question_answering.sh b/demo/ernie-classification/run_question_answering.sh
new file mode 100644
index 00000000..56fa22ad
--- /dev/null
+++ b/demo/ernie-classification/run_question_answering.sh
@@ -0,0 +1,10 @@
+export CUDA_VISIBLE_DEVICES=3
+
+CKPT_DIR="./ckpt_dbqa"
+python -u question_answering.py \
+    --batch_size 8 \
+    --weight_decay 0.01 \
+    --checkpoint_dir $CKPT_DIR \
+    --num_epoch 3 \
+    --max_seq_len 512 \
+    --learning_rate 2e-5
diff --git a/demo/ernie-classification/run_question_matching.sh b/demo/ernie-classification/run_question_matching.sh
new file mode 100644
index 00000000..4780434c
--- /dev/null
+++ b/demo/ernie-classification/run_question_matching.sh
@@ -0,0 +1,10 @@
+export CUDA_VISIBLE_DEVICES=0
+
+CKPT_DIR="./ckpt_question_matching"
+python -u question_matching.py \
+    --batch_size 32 \
+    --weight_decay 0.00 \
+    --checkpoint_dir $CKPT_DIR \
+    --num_epoch 3 \
+    --max_seq_len 128 \
+    --learning_rate 2e-5
diff --git a/demo/ernie-classification/run_fintune_with_hub.sh b/demo/ernie-classification/run_sentiment_cls.sh
similarity index 81%
rename from demo/ernie-classification/run_fintune_with_hub.sh
rename to demo/ernie-classification/run_sentiment_cls.sh
index b267e3da..34203b1a 100644
--- a/demo/ernie-classification/run_fintune_with_hub.sh
+++ b/demo/ernie-classification/run_sentiment_cls.sh
@@ -1,7 +1,7 @@
 export CUDA_VISIBLE_DEVICES=3
 
-CKPT_DIR="./ckpt"
-python -u finetune_with_hub.py \
+CKPT_DIR="./ckpt_sentiment_cls"
+python -u sentiment_cls.py \
     --batch_size 32 \
     --weight_decay 0.01 \
     --checkpoint_dir $CKPT_DIR \
diff --git a/demo/ernie-classification/finetune_with_hub.py b/demo/ernie-classification/sentiment_cls.py
similarity index 98%
rename from demo/ernie-classification/finetune_with_hub.py
rename to demo/ernie-classification/sentiment_cls.py
index 841afb39..c5fd6d49 100644
--- a/demo/ernie-classification/finetune_with_hub.py
+++ b/demo/ernie-classification/sentiment_cls.py
@@ -49,10 +49,11 @@ if __name__ == '__main__':
 
     # Setup runing config for PaddleHub Finetune API
     config = hub.RunConfig(
-        eval_interval=10,
+        eval_interval=100,
         use_cuda=True,
         num_epoch=args.num_epoch,
         batch_size=args.batch_size,
+        checkpoint_dir=args.checkpoint_dir,
         strategy=strategy)
 
     # loading Paddlehub ERNIE pretrained model
diff --git a/paddlehub/dataset/__init__.py b/paddlehub/dataset/__init__.py
index b059666e..a8179125 100644
--- a/paddlehub/dataset/__init__.py
+++ b/paddlehub/dataset/__init__.py
@@ -15,5 +15,7 @@
 from .dataset import InputExample, HubDataset
 from .chnsenticorp import ChnSentiCorp
 from .msra_ner import MSRA_NER
+from .nlpcc_dbqa import NLPCC_DBQA
+from .lcqmc import LCQMC
 from .dogcat import DogCatDataset as DogCat
 from .flowers import FlowersDataset as Flowers
diff --git a/paddlehub/dataset/chnsenticorp.py b/paddlehub/dataset/chnsenticorp.py
index 3cb879dd..9bac2bb4 100644
--- a/paddlehub/dataset/chnsenticorp.py
+++ b/paddlehub/dataset/chnsenticorp.py
@@ -16,12 +16,12 @@ from collections import namedtuple
 import os
 import csv
 
-from paddlehub.dataset import InputExample
-from paddlehub.dataset import HubDataset
+from paddlehub.dataset import InputExample, HubDataset
 from paddlehub.common.downloader import default_downloader
 from paddlehub.common.dir import DATA_HOME
+from paddlehub.common.logger import logger
 
-DATA_URL = "https://paddlehub-dataset.bj.bcebos.com/chnsenticorp_data.tar.gz"
+DATA_URL = "https://paddlehub-dataset.bj.bcebos.com/chnsenticorp.tar.gz"
 
 
 class ChnSentiCorp(HubDataset):
@@ -31,8 +31,12 @@ class ChnSentiCorp(HubDataset):
     """
 
     def __init__(self):
-        ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-            url=DATA_URL, save_path=DATA_HOME, print_progress=True)
+        self.dataset_dir = os.path.join(DATA_HOME, "chnsenticorp")
+        if not os.path.exists(self.dataset_dir):
+            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
+                url=DATA_URL, save_path=DATA_HOME, print_progress=True)
+        else:
+            logger.info("Dataset {} already cached.".format(self.dataset_dir))
 
         self._load_train_examples()
         self._load_test_examples()
@@ -69,6 +73,7 @@ class ChnSentiCorp(HubDataset):
             reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
             examples = []
             seq_id = 0
+            header = next(reader)  # skip header
             for line in reader:
                 example = InputExample(
                     guid=seq_id, label=line[0], text_a=line[1])
@@ -81,4 +86,4 @@ class ChnSentiCorp(HubDataset):
 if __name__ == "__main__":
     ds = ChnSentiCorp()
     for e in ds.get_train_examples():
-        print(e)
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
diff --git a/paddlehub/dataset/dataset.py b/paddlehub/dataset/dataset.py
index 2b1577cf..ec13f9c1 100644
--- a/paddlehub/dataset/dataset.py
+++ b/paddlehub/dataset/dataset.py
@@ -13,6 +13,30 @@
 # limitations under the License.
 
 
+class InputExample(object):
+    """
+    Input data structure of BERT/ERNIE, which covers single-sequence tasks
+    like text classification and sequence labeling, as well as sequence-pair
+    tasks like dialog.
+    """
+
+    def __init__(self, guid, text_a, text_b=None, label=None):
+        """Constructs an InputExample.
+        Args:
+            guid: Unique id for the example.
+            text_a: string. The untokenized text of the first sequence. For single
+                sequence tasks, only this sequence must be specified.
+            text_b: (Optional) string. The untokenized text of the second sequence.
+                Must be specified only for sequence pair tasks.
+            label: (Optional) string. The label of the example. This should be
+                specified for train and dev examples, but not for test examples.
+        """
+        self.guid = guid
+        self.text_a = text_a
+        self.text_b = text_b
+        self.label = label
+
+
 class HubDataset(object):
     def get_train_examples(self):
         raise NotImplementedError()
diff --git a/paddlehub/dataset/lcqmc.py b/paddlehub/dataset/lcqmc.py
new file mode 100644
index 00000000..75d027a1
--- /dev/null
+++ b/paddlehub/dataset/lcqmc.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import namedtuple
+import os
+import csv
+
+from paddlehub.dataset import InputExample, HubDataset
+from paddlehub.common.downloader import default_downloader
+from paddlehub.common.dir import DATA_HOME
+from paddlehub.common.logger import logger
+
+DATA_URL = "https://paddlehub-dataset.bj.bcebos.com/lcqmc.tar.gz"
+
+
+class LCQMC(HubDataset):
+    def __init__(self):
+        self.dataset_dir = os.path.join(DATA_HOME, "lcqmc")
+        if not os.path.exists(self.dataset_dir):
+            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
+                url=DATA_URL, save_path=DATA_HOME, print_progress=True)
+        else:
+            logger.info("Dataset {} already cached.".format(self.dataset_dir))
+
+        self._load_train_examples()
+        self._load_test_examples()
+        self._load_dev_examples()
+
+    def _load_train_examples(self):
+        self.train_file = os.path.join(self.dataset_dir, "train.tsv")
+        self.train_examples = self._read_tsv(self.train_file)
+
+    def _load_dev_examples(self):
+        self.dev_file = os.path.join(self.dataset_dir, "dev.tsv")
+        self.dev_examples = self._read_tsv(self.dev_file)
+
+    def _load_test_examples(self):
+        self.test_file = os.path.join(self.dataset_dir, "test.tsv")
+        self.test_examples = self._read_tsv(self.test_file)
+
+    def get_train_examples(self):
+        return self.train_examples
+
+    def get_dev_examples(self):
+        return self.dev_examples
+
+    def get_test_examples(self):
+        return self.test_examples
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _read_tsv(self, input_file, quotechar=None):
+        """Reads a tab separated value file."""
+        with open(input_file, "r") as f:
+            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            examples = []
+            seq_id = 0
+            header = next(reader)  # skip header
+            for line in reader:
+                example = InputExample(
+                    guid=seq_id, label=line[2], text_a=line[0], text_b=line[1])
+                seq_id += 1
+                examples.append(example)
+
+            return examples
+
+
+if __name__ == "__main__":
+    ds = LCQMC()
+    for e in ds.get_train_examples():
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
diff --git a/paddlehub/dataset/msra_ner.py b/paddlehub/dataset/msra_ner.py
index fd091f7e..37d0d78d 100644
--- a/paddlehub/dataset/msra_ner.py
+++ b/paddlehub/dataset/msra_ner.py
@@ -19,14 +19,19 @@ from collections import namedtuple
 
 from paddlehub.common.downloader import default_downloader
 from paddlehub.common.dir import DATA_HOME
+from paddlehub.common.logger import logger
 
 DATA_URL = "https://paddlehub-dataset.bj.bcebos.com/msra_ner.tar.gz"
 
 
 class MSRA_NER(object):
     def __init__(self):
-        ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-            url=DATA_URL, save_path=DATA_HOME, print_progress=True)
+        self.dataset_dir = os.path.join(DATA_HOME, "msra_ner")
+        if not os.path.exists(self.dataset_dir):
+            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
+                url=DATA_URL, save_path=DATA_HOME, print_progress=True)
+        else:
+            logger.info("Dataset {} already cached.".format(self.dataset_dir))
 
         self._load_label_map()
         self._load_train_examples()
diff --git a/paddlehub/dataset/nlpcc_dbqa.py b/paddlehub/dataset/nlpcc_dbqa.py
new file mode 100644
index 00000000..110930dd
--- /dev/null
+++ b/paddlehub/dataset/nlpcc_dbqa.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import namedtuple
+import os
+import csv
+
+from paddlehub.dataset import InputExample, HubDataset
+from paddlehub.common.downloader import default_downloader
+from paddlehub.common.dir import DATA_HOME
+from paddlehub.common.logger import logger
+
+DATA_URL = "https://paddlehub-dataset.bj.bcebos.com/nlpcc-dbqa.tar.gz"
+
+
+class NLPCC_DBQA(HubDataset):
+    def __init__(self):
+        self.dataset_dir = os.path.join(DATA_HOME, "nlpcc-dbqa")
+        if not os.path.exists(self.dataset_dir):
+            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
+                url=DATA_URL, save_path=DATA_HOME, print_progress=True)
+        else:
+            logger.info("Dataset {} already cached.".format(self.dataset_dir))
+
+        self._load_train_examples()
+        self._load_test_examples()
+        self._load_dev_examples()
+
+    def _load_train_examples(self):
+        self.train_file = os.path.join(self.dataset_dir, "train.tsv")
+        self.train_examples = self._read_tsv(self.train_file)
+
+    def _load_dev_examples(self):
+        self.dev_file = os.path.join(self.dataset_dir, "dev.tsv")
+        self.dev_examples = self._read_tsv(self.dev_file)
+
+    def _load_test_examples(self):
+        self.test_file = os.path.join(self.dataset_dir, "test.tsv")
+        self.test_examples = self._read_tsv(self.test_file)
+
+    def get_train_examples(self):
+        return self.train_examples
+
+    def get_dev_examples(self):
+        return self.dev_examples
+
+    def get_test_examples(self):
+        return self.test_examples
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _read_tsv(self, input_file, quotechar=None):
+        """Reads a tab separated value file."""
+        with open(input_file, "r") as f:
+            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            examples = []
+            seq_id = 0
+            header = next(reader)  # skip header
+            for line in reader:
+                example = InputExample(
+                    guid=seq_id, label=line[3], text_a=line[1], text_b=line[2])
+                seq_id += 1
+                examples.append(example)
+
+            return examples
+
+
+if __name__ == "__main__":
+    ds = NLPCC_DBQA()
+    for e in ds.get_train_examples():
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
diff --git a/paddlehub/finetune/evaluate.py b/paddlehub/finetune/evaluate.py
index 758a0b66..44d23d7a 100644
--- a/paddlehub/finetune/evaluate.py
+++ b/paddlehub/finetune/evaluate.py
@@ -48,6 +48,8 @@ def evaluate_cls_task(task, data_reader, feed_list, phase="test", config=None):
                 feed=data_feeder.feed(batch),
                 fetch_list=[loss.name, accuracy.name])
             num_eval_examples += num_batch_examples
+            if num_eval_examples % 10000 == 0:
+                logger.info("{} examples evaluated.".format(num_eval_examples))
             acc_sum += accuracy_v * num_batch_examples
             loss_sum += loss_v * num_batch_examples
         eval_time_used = time.time() - eval_time_begin
diff --git a/paddlehub/reader/nlp_reader.py b/paddlehub/reader/nlp_reader.py
index 1e0ae39f..549b032b 100644
--- a/paddlehub/reader/nlp_reader.py
+++ b/paddlehub/reader/nlp_reader.py
@@ -18,6 +18,7 @@ import numpy as np
 from collections import namedtuple
 
 from paddlehub.reader import tokenization
+from paddlehub.common.logger import logger
 
 from .batching import pad_batch_data
 
@@ -46,7 +47,7 @@ class BaseReader(object):
         self.label_map = {}
         for index, label in enumerate(self.dataset.get_labels()):
             self.label_map[label] = index
-        print("Dataset label map = {}".format(self.label_map))
+        logger.info("Dataset label map = {}".format(self.label_map))
 
         self.current_example = 0
         self.current_epoch = 0
@@ -154,6 +155,9 @@ class BaseReader(object):
         position_ids = list(range(len(token_ids)))
 
         if self.label_map:
+            if example.label not in self.label_map:
+                raise KeyError(
+                    "example.label = %s not in label_map" % example.label)
             label_id = self.label_map[example.label]
         else:
             label_id = example.label
-- 
GitLab
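
A minimal usage sketch of the datasets this patch adds, assuming the patch is
applied and the dataset archives are reachable. Class and method names are
taken from the diff above; note that hub.dataset.LCQMC relies on the import
added to paddlehub/dataset/__init__.py.

import paddlehub as hub

# Each dataset downloads itself to DATA_HOME on first instantiation and is
# reused from the local cache afterwards (see the __init__ methods above).
lcqmc = hub.dataset.LCQMC()        # sentence-pair question matching, labels "0"/"1"
dbqa = hub.dataset.NLPCC_DBQA()    # document-based QA pairs, labels "0"/"1"

print(len(lcqmc.get_train_examples()))
e = lcqmc.get_train_examples()[0]
print(e.guid, e.text_a, e.text_b, e.label)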