diff --git a/demo/multi_label_classification/multi_label_classifier.py b/demo/multi_label_classification/multi_label_classifier.py index 76645d2f88fb390e3b36ea3e2c86809d17451284..34d94713e0e6c8810b262f42ebea078e7bafa049 100644 --- a/demo/multi_label_classification/multi_label_classifier.py +++ b/demo/multi_label_classification/multi_label_classifier.py @@ -39,18 +39,17 @@ if __name__ == '__main__': inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) - # Download dataset and use MultiLabelReader to read dataset - dataset = hub.dataset.Toxic() - reader = hub.reader.MultiLabelClassifyReader( - dataset=dataset, - vocab_path=module.get_vocab_path(), - max_seq_len=args.max_seq_len) - - # Setup feed list for data feeder - feed_list = [ - inputs["input_ids"].name, inputs["position_ids"].name, - inputs["segment_ids"].name, inputs["input_mask"].name - ] + # Use the appropriate tokenizer to preprocess the data set + # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9 + if module.name == "ernie_tiny": + tokenizer = hub.ErnieTinyTokenizer( + vocab_file=module.get_vocab_path(), + spm_path=module.get_spm_path(), + word_dict_path=module.get_word_dict_path()) + else: + tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path()) + dataset = hub.dataset.Toxic( + tokenizer=tokenizer, max_seq_len=args.max_seq_len) # Construct transfer learning network # Use "pooled_output" for classification tasks on an entire sentence. @@ -72,9 +71,8 @@ if __name__ == '__main__': # Define a classfication fine-tune task by PaddleHub's API multi_label_cls_task = hub.MultiLabelClassifierTask( - data_reader=reader, + dataset=dataset, feature=pooled_output, - feed_list=feed_list, num_classes=dataset.num_labels, config=config) diff --git a/demo/multi_label_classification/predict.py b/demo/multi_label_classification/predict.py index bcd849061e5a663933a83c9a39b2d0d5cf2f8705..bdf6b6f83581fc2b3ca11ff580b5eb02092e1347 100644 --- a/demo/multi_label_classification/predict.py +++ b/demo/multi_label_classification/predict.py @@ -45,20 +45,11 @@ if __name__ == '__main__': inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) - # Download dataset and use MultiLabelReader to read dataset + # Download dataset and get its label list and label num + # If you just want labels information, you can omit its tokenizer parameter to avoid preprocessing the train set. dataset = hub.dataset.Toxic() - reader = hub.reader.MultiLabelClassifyReader( - dataset=dataset, - vocab_path=module.get_vocab_path(), - max_seq_len=args.max_seq_len) - - # Setup feed list for data feeder - feed_list = [ - inputs["input_ids"].name, - inputs["position_ids"].name, - inputs["segment_ids"].name, - inputs["input_mask"].name, - ] + num_classes = dataset.num_labels + label_list = dataset.get_labels() # Construct transfer learning network # Use "pooled_output" for classification tasks on an entire sentence. @@ -75,20 +66,29 @@ if __name__ == '__main__': # Define a classfication fine-tune task by PaddleHub's API multi_label_cls_task = hub.MultiLabelClassifierTask( - data_reader=reader, + dataset=dataset, feature=pooled_output, - feed_list=feed_list, num_classes=dataset.num_labels, config=config) # Data to be predicted data = [ - [ - "Yes you did. And you admitted to doing it. See the Warren Kinsella talk page." - ], - [ - "I asked you a question. 
We both know you have my page on your watch list, so are why are you playing games and making me formally ping you? Makin'Bacon" - ], + "Yes you did. And you admitted to doing it. See the Warren Kinsella talk page.", + "I asked you a question. We both know you have my page on your watch list, so are why are you playing games and making me formally ping you? Makin'Bacon", ] + # Use the appropriate tokenizer to preprocess the data + # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9 + if module.name == "ernie_tiny": + tokenizer = hub.ErnieTinyTokenizer( + vocab_file=module.get_vocab_path(), + spm_path=module.get_spm_path(), + word_dict_path=module.get_word_dict_path()) + else: + tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path()) - print(multi_label_cls_task.predict(data=data, return_result=True)) + encoded_data = [ + tokenizer.encode(text=text, max_seq_len=args.max_seq_len) + for text in data + ] + print( + multi_label_cls_task.predict(data=encoded_data, label_list=label_list)) diff --git a/demo/qa_classification/classifier.py b/demo/qa_classification/classifier.py index 70f22a70938017ca270f0d3577a1574053c0fa9f..86f91bbd79ff9eb2c087e610c0b93c768204976b 100644 --- a/demo/qa_classification/classifier.py +++ b/demo/qa_classification/classifier.py @@ -36,31 +36,28 @@ args = parser.parse_args() if __name__ == '__main__': # Load Paddlehub ERNIE pretrained model - module = hub.Module(name="ernie") + module = hub.Module(name="ernie_tiny") inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) - # Download dataset and use ClassifyReader to read dataset - dataset = hub.dataset.NLPCC_DBQA() - reader = hub.reader.ClassifyReader( - dataset=dataset, - vocab_path=module.get_vocab_path(), - max_seq_len=args.max_seq_len) + # Use the appropriate tokenizer to preprocess the data set + # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9 + if module.name == "ernie_tiny": + tokenizer = hub.ErnieTinyTokenizer( + vocab_file=module.get_vocab_path(), + spm_path=module.get_spm_path(), + word_dict_path=module.get_word_dict_path()) + else: + tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path()) + + dataset = hub.dataset.NLPCC_DBQA( + tokenizer=tokenizer, max_seq_len=args.max_seq_len) # Construct transfer learning network # Use "pooled_output" for classification tasks on an entire sentence. # Use "sequence_output" for token-level output. 
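The tokenizer-selection branch above recurs in every demo touched by this patch; a small helper distills it. build_tokenizer is a hypothetical name, not part of the patch, and it only wraps the hub calls already shown.

import paddlehub as hub

def build_tokenizer(module):
    # ernie_tiny ships a sentencepiece model and a word dict for subword segmentation;
    # the other transformer modules fall back to the plain BERT tokenizer.
    if module.name == "ernie_tiny":
        return hub.ErnieTinyTokenizer(
            vocab_file=module.get_vocab_path(),
            spm_path=module.get_spm_path(),
            word_dict_path=module.get_word_dict_path())
    return hub.BertTokenizer(vocab_file=module.get_vocab_path())

# Usage sketch: tokenizer = build_tokenizer(hub.Module(name="ernie_tiny"))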
pooled_output = outputs["pooled_output"] - # Setup feed list for data feeder - # Must feed all the tensor of ERNIE's module need - feed_list = [ - inputs["input_ids"].name, - inputs["position_ids"].name, - inputs["segment_ids"].name, - inputs["input_mask"].name, - ] - # Select fine-tune strategy, setup config and fine-tune strategy = hub.AdamWeightDecayStrategy( warmup_proportion=args.warmup_proportion, @@ -78,9 +75,8 @@ if __name__ == '__main__': # Define a classfication fine-tune task by PaddleHub's API cls_task = hub.TextClassifierTask( - data_reader=reader, + dataset=dataset, feature=pooled_output, - feed_list=feed_list, num_classes=dataset.num_labels, config=config) diff --git a/demo/qa_classification/predict.py b/demo/qa_classification/predict.py index 170319d2ee55f0c8060d42fb3f18ec920152ccc7..df27c69b270e3ec8f2c6638398c50e345540cf52 100644 --- a/demo/qa_classification/predict.py +++ b/demo/qa_classification/predict.py @@ -39,30 +39,20 @@ args = parser.parse_args() if __name__ == '__main__': # loading Paddlehub ERNIE pretrained model - module = hub.Module(name="ernie") + module = hub.Module(name="ernie_tiny") inputs, outputs, program = module.context(max_seq_len=args.max_seq_len) - # Sentence classification dataset reader + # Download dataset and get its label list and label num + # If you just want labels information, you can omit its tokenizer parameter to avoid preprocessing the train set. dataset = hub.dataset.NLPCC_DBQA() - reader = hub.reader.ClassifyReader( - dataset=dataset, - vocab_path=module.get_vocab_path(), - max_seq_len=args.max_seq_len) + num_classes = dataset.num_labels + label_list = dataset.get_labels() # Construct transfer learning network # Use "pooled_output" for classification tasks on an entire sentence. # Use "sequence_output" for token-level output. pooled_output = outputs["pooled_output"] - # Setup feed list for data feeder - # Must feed all the tensor of ERNIE's module need - feed_list = [ - inputs["input_ids"].name, - inputs["position_ids"].name, - inputs["segment_ids"].name, - inputs["input_mask"].name, - ] - # Setup RunConfig for PaddleHub Fine-tune API config = hub.RunConfig( use_data_parallel=False, @@ -73,9 +63,8 @@ if __name__ == '__main__': # Define a classfication fine-tune task by PaddleHub's API cls_task = hub.TextClassifierTask( - data_reader=reader, + dataset=dataset, feature=pooled_output, - feed_list=feed_list, num_classes=dataset.num_labels, config=config) @@ -83,5 +72,18 @@ if __name__ == '__main__': data = [["北京奥运博物馆的场景效果负责人是谁?", "主要承担奥运文物征集、保管、研究和爱国主义教育基地建设相关工作。"], ["北京奥运博物馆的场景效果负责人是谁", "于海勃,美国加利福尼亚大学教授 场景效果负责人 总设计师"], ["北京奥运博物馆的场景效果负责人是谁?", "洪麦恩,清华大学美术学院教授 内容及主展线负责人 总设计师"]] - - print(cls_task.predict(data=data, return_result=True)) + # Use the appropriate tokenizer to preprocess the data + # For ernie_tiny, it will do word segmentation to get subword. 
More details: https://www.jiqizhixin.com/articles/2019-11-06-9 + if module.name == "ernie_tiny": + tokenizer = hub.ErnieTinyTokenizer( + vocab_file=module.get_vocab_path(), + spm_path=module.get_spm_path(), + word_dict_path=module.get_word_dict_path()) + else: + tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path()) + encoded_data = [ + tokenizer.encode( + text=text, text_pair=text_pair, max_seq_len=args.max_seq_len) + for text, text_pair in data + ] + print(cls_task.predict(data=encoded_data, label_list=label_list)) diff --git a/demo/reading_comprehension/reading_comprehension.py b/demo/reading_comprehension/reading_comprehension.py index d4793823d2147ecb6f8badb776d4cb827b541a8d..51b6a62b1d4dd3f4ca3ed22c76d4b9fe2b55d00e 100644 --- a/demo/reading_comprehension/reading_comprehension.py +++ b/demo/reading_comprehension/reading_comprehension.py @@ -17,7 +17,6 @@ import argparse import ast -import paddle.fluid as fluid import paddlehub as hub hub.common.logger.logger.setLevel("INFO") @@ -42,28 +41,23 @@ if __name__ == '__main__': inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) - # Download dataset and use ReadingComprehensionReader to read dataset + # Use the appropriate tokenizer to preprocess the data set + # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9 + if module.name == "ernie_tiny": + tokenizer = hub.ErnieTinyTokenizer( + vocab_file=module.get_vocab_path(), + spm_path=module.get_spm_path(), + word_dict_path=module.get_word_dict_path()) + else: + tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path()) + # If you wanna load SQuAD 2.0 dataset, just set version_2_with_negative as True - dataset = hub.dataset.SQUAD(version_2_with_negative=False) + dataset = hub.dataset.SQUAD( + version_2_with_negative=False, + tokenizer=tokenizer, + max_seq_len=args.max_seq_len) # dataset = hub.dataset.SQUAD(version_2_with_negative=True) - reader = hub.reader.ReadingComprehensionReader( - dataset=dataset, - vocab_path=module.get_vocab_path(), - max_seq_len=args.max_seq_len, - doc_stride=128, - max_query_length=64) - - seq_output = outputs["sequence_output"] - - # Setup feed list for data feeder - feed_list = [ - inputs["input_ids"].name, - inputs["position_ids"].name, - inputs["segment_ids"].name, - inputs["input_mask"].name, - ] - # Select fine-tune strategy, setup config and fine-tune strategy = hub.AdamWeightDecayStrategy( weight_decay=args.weight_decay, @@ -72,7 +66,7 @@ if __name__ == '__main__': # Setup RunConfig for PaddleHub Fine-tune API config = hub.RunConfig( - eval_interval=300, + eval_interval=100, use_data_parallel=args.use_data_parallel, use_cuda=args.use_gpu, num_epoch=args.num_epoch, @@ -82,9 +76,8 @@ if __name__ == '__main__': # Define a reading comprehension fine-tune task by PaddleHub's API reading_comprehension_task = hub.ReadingComprehensionTask( - data_reader=reader, - feature=seq_output, - feed_list=feed_list, + dataset=dataset, + feature=outputs["sequence_output"], config=config, sub_task="squad", ) diff --git a/demo/regression/predict.py b/demo/regression/predict.py index b9e73d995f9c63fd847bda46561bd35c66a31f2a..9c6e5389fc7a2b9d4a2639b2200f3870c4251802 100644 --- a/demo/regression/predict.py +++ b/demo/regression/predict.py @@ -20,12 +20,6 @@ from __future__ import print_function import argparse import ast -import numpy as np -import os -import time - -import paddle -import paddle.fluid as fluid import paddlehub as hub # yapf: disable @@ -43,27 
+37,11 @@ if __name__ == '__main__': inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) - # Download dataset and use RegressionReader to read dataset - dataset = hub.dataset.GLUE("STS-B") - reader = hub.reader.RegressionReader( - dataset=dataset, - vocab_path=module.get_vocab_path(), - max_seq_len=args.max_seq_len) - # Construct transfer learning network # Use "pooled_output" for classification tasks on an entire sentence. # Use "sequence_output" for token-level output. pooled_output = outputs["pooled_output"] - # Setup feed list for data feeder - # Must feed all the tensor of ERNIE's module need - feed_list = [ - inputs["input_ids"].name, - inputs["position_ids"].name, - inputs["segment_ids"].name, - inputs["input_mask"].name, - ] - # Setup RunConfig for PaddleHub Fine-tune API config = hub.RunConfig( use_data_parallel=False, @@ -74,13 +52,22 @@ if __name__ == '__main__': # Define a regression fine-tune task by PaddleHub's API reg_task = hub.RegressionTask( - data_reader=reader, feature=pooled_output, - feed_list=feed_list, config=config, ) - # Data to be prdicted - data = [[d.text_a, d.text_b] for d in dataset.get_predict_examples()[:10]] - - print(reg_task.predict(data=data, return_result=True)) + # STS-B has provided the predict data, and the dataset has process it. If you want to process customized data, + # see the predict.py in text_classification demo + # Use the appropriate tokenizer to preprocess the data + # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9 + if module.name == "ernie_tiny": + tokenizer = hub.ErnieTinyTokenizer( + vocab_file=module.get_vocab_path(), + spm_path=module.get_spm_path(), + word_dict_path=module.get_word_dict_path()) + else: + tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path()) + dataset = hub.dataset.GLUE( + "STS-B", tokenizer=tokenizer, max_seq_len=args.max_seq_len) + encoded_data = dataset.get_predict_records()[:10] + print(reg_task.predict(data=encoded_data)) diff --git a/demo/regression/regression.py b/demo/regression/regression.py index 0979e1c639ca728c46151ad151aaaa9bd389ecc1..bc2eb6a34f1bede0f1bc5e72ef94265b36ef6853 100644 --- a/demo/regression/regression.py +++ b/demo/regression/regression.py @@ -17,7 +17,6 @@ import argparse import ast -import paddle.fluid as fluid import paddlehub as hub # yapf: disable @@ -41,27 +40,24 @@ if __name__ == '__main__': inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) - # Download dataset and use RegressionReader to read dataset - dataset = hub.dataset.GLUE("STS-B") - reader = hub.reader.RegressionReader( - dataset=dataset, - vocab_path=module.get_vocab_path(), - max_seq_len=args.max_seq_len) + # Use the appropriate tokenizer to preprocess the data set + # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9 + if module.name == "ernie_tiny": + tokenizer = hub.ErnieTinyTokenizer( + vocab_file=module.get_vocab_path(), + spm_path=module.get_spm_path(), + word_dict_path=module.get_word_dict_path()) + else: + tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path()) + + dataset = hub.dataset.GLUE( + "STS-B", tokenizer=tokenizer, max_seq_len=args.max_seq_len) # Construct transfer learning network # Use "pooled_output" for classification tasks on an entire sentence. # Use "sequence_output" for token-level output. 
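The regression predict demo above scores STS-B's bundled predict split; a minimal sketch of scoring custom sentence pairs instead, assuming tokenizer, args and reg_task exist as in that demo (the pairs below are made-up examples):

# Encode ad-hoc sentence pairs into the same record format and reuse the task's predict API.
custom_pairs = [
    ("A man is playing a guitar.", "A person plays an instrument."),
    ("A woman is slicing an onion.", "Someone is cutting vegetables."),
]
encoded_pairs = [
    tokenizer.encode(text=text, text_pair=text_pair, max_seq_len=args.max_seq_len)
    for text, text_pair in custom_pairs
]
print(reg_task.predict(data=encoded_pairs))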
pooled_output = outputs["pooled_output"] - # Setup feed list for data feeder - # Must feed all the tensor of ERNIE's module need - feed_list = [ - inputs["input_ids"].name, - inputs["position_ids"].name, - inputs["segment_ids"].name, - inputs["input_mask"].name, - ] - # Select fine-tune strategy, setup config and fine-tune strategy = hub.AdamWeightDecayStrategy( warmup_proportion=args.warmup_proportion, @@ -70,7 +66,6 @@ if __name__ == '__main__': # Setup RunConfig for PaddleHub Fine-tune API config = hub.RunConfig( - eval_interval=300, use_data_parallel=args.use_data_parallel, use_cuda=args.use_gpu, num_epoch=args.num_epoch, @@ -80,10 +75,7 @@ if __name__ == '__main__': # Define a regression fine-tune task by PaddleHub's API reg_task = hub.RegressionTask( - data_reader=reader, - feature=pooled_output, - feed_list=feed_list, - config=config) + dataset=dataset, feature=pooled_output, config=config) # Fine-tune and evaluate by PaddleHub's API # will finish training, evaluation, testing, save model automatically diff --git a/demo/sequence_labeling/predict.py b/demo/sequence_labeling/predict.py index 54deb81d41f848719b7d1263b56b0cdadefa7de4..4db8da5e88a74e180234296b0c365e578c7e41db 100644 --- a/demo/sequence_labeling/predict.py +++ b/demo/sequence_labeling/predict.py @@ -42,30 +42,16 @@ if __name__ == '__main__': module = hub.Module(name="ernie_tiny") inputs, outputs, program = module.context(max_seq_len=args.max_seq_len) - # Sentence labeling dataset reader + # Download dataset and get its label list and label num + # If you just want labels information, you can omit its tokenizer parameter to avoid preprocessing the train set. dataset = hub.dataset.MSRA_NER() - reader = hub.reader.SequenceLabelReader( - dataset=dataset, - vocab_path=module.get_vocab_path(), - max_seq_len=args.max_seq_len, - sp_model_path=module.get_spm_path(), - word_dict_path=module.get_word_dict_path()) - - inv_label_map = {val: key for key, val in reader.label_map.items()} + num_classes = dataset.num_labels + label_list = dataset.get_labels() # Construct transfer learning network # Use "sequence_output" for token-level output. 
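The "labels only" comment above relies on the new dataset behaviour that preprocessing happens only when a tokenizer is attached; a short sketch of the two modes, assuming a tokenizer built as in the other demos:

import paddlehub as hub

# Metadata only: no tokenizer, so no example is encoded, but label info is still available.
meta_only = hub.dataset.MSRA_NER()
print(meta_only.num_labels, meta_only.get_labels())

# Full preprocessing: attaching a tokenizer makes the encoded records available lazily.
encoded = hub.dataset.MSRA_NER(tokenizer=tokenizer, max_seq_len=128)
dev_records = encoded.get_dev_records()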
sequence_output = outputs["sequence_output"] - # Setup feed list for data feeder - # Must feed all the tensor of ERNIE's module need - feed_list = [ - inputs["input_ids"].name, - inputs["position_ids"].name, - inputs["segment_ids"].name, - inputs["input_mask"].name, - ] - # Setup RunConfig for PaddleHub Fine-tune API config = hub.RunConfig( use_data_parallel=False, @@ -77,33 +63,31 @@ if __name__ == '__main__': # Define a sequence labeling fine-tune task by PaddleHub's API # if add crf, the network use crf as decoder seq_label_task = hub.SequenceLabelTask( - data_reader=reader, feature=sequence_output, - feed_list=feed_list, max_seq_len=args.max_seq_len, - num_classes=dataset.num_labels, + num_classes=num_classes, config=config, add_crf=False) # Data to be predicted - # If using python 2, prefix "u" is necessary - data = [ - [u"我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。"], - [u"为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。"], - [u"其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。"], - [u"有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。"], - [u"不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。"], + text_a = [ + "我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。", + "为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。", + "其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。", + "有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。", + "不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。", ] # Add 0x02 between characters to match the format of training data, # otherwise the length of prediction results will not match the input string # if the input string contains non-Chinese characters. - tmp_data = [] - for example in data: - formatted = [] - for sentence in example: - formatted.append('\x02'.join(list(sentence))) - tmp_data.append(formatted) - data = tmp_data + formatted_text_a = list(map("\002".join, text_a)) - print(seq_label_task.predict(data=data, return_result=True)) + # Use the appropriate tokenizer to preprocess the data + # For ernie_tiny, it use BertTokenizer too. + tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path()) + encoded_data = [ + tokenizer.encode(text=text, max_seq_len=args.max_seq_len) + for text in formatted_text_a + ] + print(seq_label_task.predict(data=encoded_data, label_list=label_list)) diff --git a/demo/sequence_labeling/sequence_label.py b/demo/sequence_labeling/sequence_label.py index 958f9839b9fa1ea4655dec20e56165eaf7883da1..e1012e26496e416e7d55e3baa46ae93beb31da19 100644 --- a/demo/sequence_labeling/sequence_label.py +++ b/demo/sequence_labeling/sequence_label.py @@ -40,26 +40,16 @@ if __name__ == '__main__': inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) - # Download dataset and use SequenceLabelReader to read dataset - dataset = hub.dataset.MSRA_NER() - reader = hub.reader.SequenceLabelReader( - dataset=dataset, - vocab_path=module.get_vocab_path(), - max_seq_len=args.max_seq_len, - sp_model_path=module.get_spm_path(), - word_dict_path=module.get_word_dict_path()) + # Use the appropriate tokenizer to preprocess the data set + # For ernie_tiny, it use BertTokenizer too. + tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path()) + dataset = hub.dataset.MSRA_NER( + tokenizer=tokenizer, max_seq_len=args.max_seq_len) # Construct transfer learning network # Use "sequence_output" for token-level output. 
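The \002 separator in the predict demo above mirrors the MSRA_NER training format, where every character is its own token; a small illustration (the sample string is made up):

# Join characters with \002 so the tokenizer yields one token per character.
# Without this, digits or Latin letters can be merged into multi-character tokens and the
# predicted tag sequence would no longer align 1:1 with the input string.
sample = "华为于1987年成立"
formatted = "\002".join(list(sample))
print(formatted.split("\002"))  # ['华', '为', '于', '1', '9', '8', '7', '年', '成', '立']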
sequence_output = outputs["sequence_output"] - # Setup feed list for data feeder - # Must feed all the tensor of module need - feed_list = [ - inputs["input_ids"].name, inputs["position_ids"].name, - inputs["segment_ids"].name, inputs["input_mask"].name - ] - # Select a fine-tune strategy strategy = hub.AdamWeightDecayStrategy( warmup_proportion=args.warmup_proportion, @@ -78,9 +68,8 @@ if __name__ == '__main__': # Define a sequence labeling fine-tune task by PaddleHub's API # If add crf, the network use crf as decoder seq_label_task = hub.SequenceLabelTask( - data_reader=reader, + dataset=dataset, feature=sequence_output, - feed_list=feed_list, max_seq_len=args.max_seq_len, num_classes=dataset.num_labels, config=config, diff --git a/demo/sequence_labeling/sequence_label_dygraph.py b/demo/sequence_labeling/sequence_label_dygraph.py index f95de0a278970c9a0ed45e38f7554f698b464a05..58f9378898dc96c8059f9340a7a01cce6bbe03aa 100644 --- a/demo/sequence_labeling/sequence_label_dygraph.py +++ b/demo/sequence_labeling/sequence_label_dygraph.py @@ -21,9 +21,9 @@ parser.add_argument("--max_seq_len", type=int, default=512, # yapf: enable. -class TransformerSequenceLabelLayer(fluid.dygraph.Layer): +class TransformerSeqLabeling(fluid.dygraph.Layer): def __init__(self, num_classes, transformer): - super(TransformerSequenceLabelLayer, self).__init__() + super(TransformerSeqLabeling, self).__init__() self.num_classes = num_classes self.transformer = transformer self.fc = Linear(input_dim=768, output_dim=num_classes) @@ -39,11 +39,15 @@ class TransformerSequenceLabelLayer(fluid.dygraph.Layer): def finetune(args): - ernie = hub.Module(name="ernie", max_seq_len=args.max_seq_len) + module = hub.Module(name="ernie", max_seq_len=args.max_seq_len) + # Use the appropriate tokenizer to preprocess the data set + tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path()) + dataset = hub.dataset.MSRA_NER( + tokenizer=tokenizer, max_seq_len=args.max_seq_len) + with fluid.dygraph.guard(): - dataset = hub.dataset.MSRA_NER() - ts = TransformerSequenceLabelLayer( - num_classes=dataset.num_labels, transformer=ernie) + ts = TransformerSeqLabeling( + num_classes=dataset.num_labels, transformer=module) adam = AdamOptimizer(learning_rate=1e-5, parameter_list=ts.parameters()) state_dict_path = os.path.join(args.checkpoint_dir, 'dygraph_state_dict') @@ -51,34 +55,32 @@ def finetune(args): state_dict, _ = fluid.load_dygraph(state_dict_path) ts.load_dict(state_dict) - reader = hub.reader.SequenceLabelReader( - dataset=dataset, - vocab_path=ernie.get_vocab_path(), - max_seq_len=args.max_seq_len, - sp_model_path=ernie.get_spm_path(), - word_dict_path=ernie.get_word_dict_path()) - train_reader = reader.data_generator( - batch_size=args.batch_size, phase='train') - loss_sum = total_infer = total_label = total_correct = cnt = 0 - # 执行epoch_num次训练 for epoch in range(args.num_epoch): - # 读取训练数据进行训练 - for batch_id, data in enumerate(train_reader()): - input_ids = np.array(data[0][0]).astype(np.int64) - position_ids = np.array(data[0][1]).astype(np.int64) - segment_ids = np.array(data[0][2]).astype(np.int64) - input_mask = np.array(data[0][3]).astype(np.float32) - labels = np.array(data[0][4]).astype(np.int64).reshape(-1, 1) - seq_len = np.squeeze( - np.array(data[0][5]).astype(np.int64), axis=1) + for batch_id, data in enumerate( + dataset.batch_records_generator( + phase="train", + batch_size=args.batch_size, + shuffle=True, + pad_to_batch_max_seq_len=False)): + batch_size = len(data["input_ids"]) + input_ids = 
np.array(data["input_ids"]).astype( + np.int64).reshape([batch_size, -1, 1]) + position_ids = np.array(data["position_ids"]).astype( + np.int64).reshape([batch_size, -1, 1]) + segment_ids = np.array(data["segment_ids"]).astype( + np.int64).reshape([batch_size, -1, 1]) + input_mask = np.array(data["input_mask"]).astype( + np.float32).reshape([batch_size, -1, 1]) + labels = np.array(data["label"]).astype(np.int64).reshape(-1, 1) + seq_len = np.array(data["seq_len"]).astype(np.int64).reshape( + -1, 1) pred, ret_infers = ts(input_ids, position_ids, segment_ids, input_mask) loss = fluid.layers.cross_entropy(pred, to_variable(labels)) avg_loss = fluid.layers.mean(loss) avg_loss.backward() - # 参数更新 adam.minimize(avg_loss) loss_sum += avg_loss.numpy() * labels.shape[0] diff --git a/demo/text_classification/predict.py b/demo/text_classification/predict.py index 3a63e63b1078d537e502aad0613cccd712186b72..85a938cd4c173260ef9b9f39c39eea64ac446f52 100644 --- a/demo/text_classification/predict.py +++ b/demo/text_classification/predict.py @@ -20,11 +20,7 @@ from __future__ import print_function import argparse import ast -import numpy as np -import os -import time -import paddle -import paddle.fluid as fluid + import paddlehub as hub # yapf: disable @@ -43,32 +39,11 @@ if __name__ == '__main__': inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) - # Download dataset and use accuracy as metrics - # Choose dataset: GLUE/XNLI/ChinesesGLUE/NLPCC-DBQA/LCQMC + # Download dataset and get its label list and label num + # If you just want labels information, you can omit its tokenizer parameter to avoid preprocessing the train set. dataset = hub.dataset.ChnSentiCorp() - - # For ernie_tiny, it use sub-word to tokenize chinese sentence - # If not ernie tiny, sp_model_path and word_dict_path should be set None - reader = hub.reader.ClassifyReader( - dataset=dataset, - vocab_path=module.get_vocab_path(), - max_seq_len=args.max_seq_len, - sp_model_path=module.get_spm_path(), - word_dict_path=module.get_word_dict_path()) - - # Construct transfer learning network - # Use "pooled_output" for classification tasks on an entire sentence. - # Use "sequence_output" for token-level output. - pooled_output = outputs["pooled_output"] - - # Setup feed list for data feeder - # Must feed all the tensor of module need - feed_list = [ - inputs["input_ids"].name, - inputs["position_ids"].name, - inputs["segment_ids"].name, - inputs["input_mask"].name, - ] + num_classes = dataset.num_labels + label_list = dataset.get_labels() # Setup RunConfig for PaddleHub Fine-tune API config = hub.RunConfig( @@ -80,14 +55,26 @@ if __name__ == '__main__': # Define a classfication fine-tune task by PaddleHub's API cls_task = hub.TextClassifierTask( - data_reader=reader, - feature=pooled_output, - feed_list=feed_list, - num_classes=dataset.num_labels, + feature=outputs["pooled_output"], + num_classes=num_classes, config=config) # Data to be prdicted - data = [["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"], ["交通方便;环境很好;服务态度很好 房间较小"], - ["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"]] - - print(cls_task.predict(data=data, return_result=True)) + text_a = [ + "这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般", "交通方便;环境很好;服务态度很好 房间较小", + "19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~" + ] + # Use the appropriate tokenizer to preprocess the data + # For ernie_tiny, it will do word segmentation to get subword. 
More details: https://www.jiqizhixin.com/articles/2019-11-06-9 + if module.name == "ernie_tiny": + tokenizer = hub.ErnieTinyTokenizer( + vocab_file=module.get_vocab_path(), + spm_path=module.get_spm_path(), + word_dict_path=module.get_word_dict_path()) + else: + tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path()) + encoded_data = [ + tokenizer.encode(text=text, max_seq_len=args.max_seq_len) + for text in text_a + ] + print(cls_task.predict(data=encoded_data, label_list=label_list)) diff --git a/demo/text_classification/predict_predefine_net.py b/demo/text_classification/predict_predefine_net.py index 3255270310527b81c3eb272d8331ff7ce3dfd3b3..ececf7b78b3933fefc9e4722463f9a746a47f3f3 100644 --- a/demo/text_classification/predict_predefine_net.py +++ b/demo/text_classification/predict_predefine_net.py @@ -20,11 +20,7 @@ from __future__ import print_function import argparse import ast -import numpy as np -import os -import time -import paddle -import paddle.fluid as fluid + import paddlehub as hub # yapf: disable @@ -44,33 +40,17 @@ if __name__ == '__main__': inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) - # Download dataset and use accuracy as metrics - # Choose dataset: GLUE/XNLI/ChinesesGLUE/NLPCC-DBQA/LCQMC + # Download dataset and get its label list and label num + # If you just want labels information, you can omit its tokenizer parameter to avoid preprocessing the train set. dataset = hub.dataset.ChnSentiCorp() - - # For ernie_tiny, it use sub-word to tokenize chinese sentence - # If not ernie tiny, sp_model_path and word_dict_path should be set None - reader = hub.reader.ClassifyReader( - dataset=dataset, - vocab_path=module.get_vocab_path(), - max_seq_len=args.max_seq_len, - sp_model_path=module.get_spm_path(), - word_dict_path=module.get_word_dict_path()) + num_classes = dataset.num_labels + label_list = dataset.get_labels() # Construct transfer learning network # Use "pooled_output" for classification tasks on an entire sentence. # Use "sequence_output" for token-level output. token_feature = outputs["sequence_output"] - # Setup feed list for data feeder - # Must feed all the tensor of module need - feed_list = [ - inputs["input_ids"].name, - inputs["position_ids"].name, - inputs["segment_ids"].name, - inputs["input_mask"].name, - ] - # Setup RunConfig for PaddleHub Fine-tune API config = hub.RunConfig( use_data_parallel=args.use_data_parallel, @@ -85,15 +65,27 @@ if __name__ == '__main__': # you must use the outputs["sequence_output"] as the token_feature of TextClassifierTask, # rather than outputs["pooled_output"], and feature is None cls_task = hub.TextClassifierTask( - data_reader=reader, token_feature=token_feature, - feed_list=feed_list, network=args.network, num_classes=dataset.num_labels, config=config) # Data to be prdicted - data = [["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"], ["交通方便;环境很好;服务态度很好 房间较小"], - ["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"]] - - print(cls_task.predict(data=data, return_result=True)) + text_a = [ + "这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般", "交通方便;环境很好;服务态度很好 房间较小", + "19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~" + ] + # Use the appropriate tokenizer to preprocess the data + # For ernie_tiny, it will do word segmentation to get subword. 
More details: https://www.jiqizhixin.com/articles/2019-11-06-9 + if module.name == "ernie_tiny": + tokenizer = hub.ErnieTinyTokenizer( + vocab_file=module.get_vocab_path(), + spm_path=module.get_spm_path(), + word_dict_path=module.get_word_dict_path()) + else: + tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path()) + encoded_data = [ + tokenizer.encode(text=text, max_seq_len=args.max_seq_len) + for text in text_a + ] + print(cls_task.predict(data=encoded_data, label_list=label_list)) diff --git a/demo/text_classification/run_predict_predefine_net.sh b/demo/text_classification/run_predict_predefine_net.sh index a29e713f226cc7b92a062defeebd984ab960a0ca..0a70c9d19c7f6f24985627bd998a61a726a3e787 100644 --- a/demo/text_classification/run_predict_predefine_net.sh +++ b/demo/text_classification/run_predict_predefine_net.sh @@ -7,5 +7,5 @@ python -u predict_predefine_net.py \ --checkpoint_dir=$CKPT_DIR \ --max_seq_len=128 \ --use_gpu=True \ - --batch_size=24 \ + --batch_size=1 \ --network=bilstm diff --git a/demo/text_classification/text_classifier_dygraph.py b/demo/text_classification/text_classifier_dygraph.py index b648740e1b3a43cc668d9ecfb2ab4c05641bb18d..8ab03225b865b5ce98e20027ddb28d436534a9f0 100644 --- a/demo/text_classification/text_classifier_dygraph.py +++ b/demo/text_classification/text_classifier_dygraph.py @@ -40,11 +40,23 @@ class TransformerClassifier(fluid.dygraph.Layer): def finetune(args): - ernie = hub.Module(name="ernie", max_seq_len=args.max_seq_len) + module = hub.Module(name="ernie", max_seq_len=args.max_seq_len) + # Use the appropriate tokenizer to preprocess the data set + # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9 + if module.name == "ernie_tiny": + tokenizer = hub.ErnieTinyTokenizer( + vocab_file=module.get_vocab_path(), + spm_path=module.get_spm_path(), + word_dict_path=module.get_word_dict_path(), + ) + else: + tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path()) + dataset = hub.dataset.ChnSentiCorp( + tokenizer=tokenizer, max_seq_len=args.max_seq_len) + with fluid.dygraph.guard(): - dataset = hub.dataset.ChnSentiCorp() tc = TransformerClassifier( - num_classes=dataset.num_labels, transformer=ernie) + num_classes=dataset.num_labels, transformer=module) adam = AdamOptimizer(learning_rate=1e-5, parameter_list=tc.parameters()) state_dict_path = os.path.join(args.checkpoint_dir, 'dygraph_state_dict') @@ -52,32 +64,31 @@ def finetune(args): state_dict, _ = fluid.load_dygraph(state_dict_path) tc.load_dict(state_dict) - reader = hub.reader.ClassifyReader( - dataset=dataset, - vocab_path=ernie.get_vocab_path(), - max_seq_len=args.max_seq_len, - sp_model_path=ernie.get_spm_path(), - word_dict_path=ernie.get_word_dict_path()) - train_reader = reader.data_generator( - batch_size=args.batch_size, phase='train') - loss_sum = acc_sum = cnt = 0 - # 执行epoch_num次训练 for epoch in range(args.num_epoch): - # 读取训练数据进行训练 - for batch_id, data in enumerate(train_reader()): - input_ids = np.array(data[0][0]).astype(np.int64) - position_ids = np.array(data[0][1]).astype(np.int64) - segment_ids = np.array(data[0][2]).astype(np.int64) - input_mask = np.array(data[0][3]).astype(np.float32) - labels = np.array(data[0][4]).astype(np.int64) + for batch_id, data in enumerate( + dataset.batch_records_generator( + phase="train", + batch_size=args.batch_size, + shuffle=True, + pad_to_batch_max_seq_len=False)): + batch_size = len(data["input_ids"]) + input_ids = np.array(data["input_ids"]).astype( + 
np.int64).reshape([batch_size, -1, 1]) + position_ids = np.array(data["position_ids"]).astype( + np.int64).reshape([batch_size, -1, 1]) + segment_ids = np.array(data["segment_ids"]).astype( + np.int64).reshape([batch_size, -1, 1]) + input_mask = np.array(data["input_mask"]).astype( + np.float32).reshape([batch_size, -1, 1]) + labels = np.array(data["label"]).astype(np.int64).reshape( + [batch_size, 1]) pred = tc(input_ids, position_ids, segment_ids, input_mask) acc = fluid.layers.accuracy(pred, to_variable(labels)) loss = fluid.layers.cross_entropy(pred, to_variable(labels)) avg_loss = fluid.layers.mean(loss) avg_loss.backward() - # 参数更新 adam.minimize(avg_loss) loss_sum += avg_loss.numpy() * labels.shape[0] diff --git a/demo/text_classification/text_cls.py b/demo/text_classification/text_cls.py index b68925ba282775b0c57ceb6b249bc53ac258c55e..ff647c149f49f8273dfd5fb6a0d778405a30eeba 100644 --- a/demo/text_classification/text_cls.py +++ b/demo/text_classification/text_cls.py @@ -16,6 +16,7 @@ import argparse import ast + import paddlehub as hub # yapf: disable @@ -39,35 +40,24 @@ if __name__ == '__main__': inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) - # Download dataset and use accuracy as metrics - # Choose dataset: GLUE/XNLI/ChinesesGLUE/NLPCC-DBQA/LCQMC - # metric should be acc, f1 or matthews - dataset = hub.dataset.ChnSentiCorp() - metrics_choices = ["acc"] + # Use the appropriate tokenizer to preprocess the data set + # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9 + if module.name == "ernie_tiny": + tokenizer = hub.ErnieTinyTokenizer( + vocab_file=module.get_vocab_path(), + spm_path=module.get_spm_path(), + word_dict_path=module.get_word_dict_path()) + else: + tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path()) - # For ernie_tiny, it use sub-word to tokenize chinese sentence - # If not ernie tiny, sp_model_path and word_dict_path should be set None - reader = hub.reader.ClassifyReader( - dataset=dataset, - vocab_path=module.get_vocab_path(), - max_seq_len=args.max_seq_len, - sp_model_path=module.get_spm_path(), - word_dict_path=module.get_word_dict_path()) + dataset = hub.dataset.ChnSentiCorp( + tokenizer=tokenizer, max_seq_len=args.max_seq_len) # Construct transfer learning network # Use "pooled_output" for classification tasks on an entire sentence. # Use "sequence_output" for token-level output. 
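Once a tokenizer is attached, every example is encoded into a dict record; a quick sketch of inspecting one, with the key names taken from the dygraph demos above:

# The record keys below are the ones the dygraph demos read:
# "input_ids", "position_ids", "segment_ids", "input_mask", "seq_len" and "label".
sample = dataset.get_train_records()[0]
print(sorted(sample.keys()))
print(len(sample["input_ids"]))  # typically args.max_seq_len, since records are padded when encoded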
pooled_output = outputs["pooled_output"] - # Setup feed list for data feeder - # Must feed all the tensor of module need - feed_list = [ - inputs["input_ids"].name, - inputs["position_ids"].name, - inputs["segment_ids"].name, - inputs["input_mask"].name, - ] - # Select fine-tune strategy, setup config and fine-tune strategy = hub.AdamWeightDecayStrategy( warmup_proportion=args.warmup_proportion, @@ -85,12 +75,11 @@ if __name__ == '__main__': # Define a classfication fine-tune task by PaddleHub's API cls_task = hub.TextClassifierTask( - data_reader=reader, + dataset=dataset, feature=pooled_output, - feed_list=feed_list, num_classes=dataset.num_labels, config=config, - metrics_choices=metrics_choices) + metrics_choices=["acc"]) # Fine-tune and evaluate by PaddleHub's API # will finish training, evaluation, testing, save model automatically diff --git a/demo/text_classification/text_cls_predefine_net.py b/demo/text_classification/text_cls_predefine_net.py index 4194bb4264bf86631fc9f550cc9b59f421be021d..6f7aab8c898d85ff96cf732d5a08b92af6c21c25 100644 --- a/demo/text_classification/text_cls_predefine_net.py +++ b/demo/text_classification/text_cls_predefine_net.py @@ -40,35 +40,24 @@ if __name__ == '__main__': inputs, outputs, program = module.context( trainable=True, max_seq_len=args.max_seq_len) - # Download dataset and use accuracy as metrics - # Choose dataset: GLUE/XNLI/ChinesesGLUE/NLPCC-DBQA/LCQMC - # metric should be acc, f1 or matthews - dataset = hub.dataset.ChnSentiCorp() - metrics_choices = ["acc"] + # Use the appropriate tokenizer to preprocess the data set + # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9 + if module.name == "ernie_tiny": + tokenizer = hub.ErnieTinyTokenizer( + vocab_file=module.get_vocab_path(), + spm_path=module.get_spm_path(), + word_dict_path=module.get_word_dict_path()) + else: + tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path()) - # For ernie_tiny, it use sub-word to tokenize chinese sentence - # If not ernie tiny, sp_model_path and word_dict_path should be set None - reader = hub.reader.ClassifyReader( - dataset=dataset, - vocab_path=module.get_vocab_path(), - max_seq_len=args.max_seq_len, - sp_model_path=module.get_spm_path(), - word_dict_path=module.get_word_dict_path()) + dataset = hub.dataset.ChnSentiCorp( + tokenizer=tokenizer, max_seq_len=args.max_seq_len) # Construct transfer learning network # Use "pooled_output" for classification tasks on an entire sentence. # Use "sequence_output" for token-level output. 
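The two classification demos differ only in which feature feeds the task; a side-by-side sketch, assuming dataset, outputs and config from the surrounding demo code (both forms appear in this patch, with "bilstm" as one predefined network):

# Plain fine-tuning head: sentence-level feature, no predefined network.
cls_task_pooled = hub.TextClassifierTask(
    dataset=dataset,
    feature=outputs["pooled_output"],
    num_classes=dataset.num_labels,
    config=config)

# Predefined network (e.g. bilstm): token-level features go in via token_feature instead.
cls_task_bilstm = hub.TextClassifierTask(
    dataset=dataset,
    token_feature=outputs["sequence_output"],
    network="bilstm",
    num_classes=dataset.num_labels,
    config=config)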
token_feature = outputs["sequence_output"] - # Setup feed list for data feeder - # Must feed all the tensor of module need - feed_list = [ - inputs["input_ids"].name, - inputs["position_ids"].name, - inputs["segment_ids"].name, - inputs["input_mask"].name, - ] - # Select fine-tune strategy, setup config and fine-tune strategy = hub.AdamWeightDecayStrategy( warmup_proportion=args.warmup_proportion, @@ -90,13 +79,12 @@ if __name__ == '__main__': # you must use the outputs["sequence_output"] as the token_feature of TextClassifierTask, # rather than outputs["pooled_output"], and feature is None cls_task = hub.TextClassifierTask( - data_reader=reader, + dataset=dataset, token_feature=token_feature, - feed_list=feed_list, network=args.network, num_classes=dataset.num_labels, config=config, - metrics_choices=metrics_choices) + metrics_choices=["acc"]) # Fine-tune and evaluate by PaddleHub's API # will finish training, evaluation, testing, save model automatically diff --git a/paddlehub/__init__.py b/paddlehub/__init__.py index 0b9200ec09c8c7e9188752449695ce58650b99fb..6cd198fcc22e42323b007feb3dc51a999047db52 100644 --- a/paddlehub/__init__.py +++ b/paddlehub/__init__.py @@ -31,6 +31,7 @@ from . import dataset from . import finetune from . import reader from . import network +from . import tokenizer from .common.dir import USER_HOME from .common.dir import HUB_HOME @@ -70,3 +71,6 @@ from .finetune.strategy import CombinedStrategy from .autofinetune.evaluator import report_final_result from .module.nlp_module import NLPPredictionModule, TransformerModule + +from .tokenizer.bert_tokenizer import BertTokenizer +from .tokenizer.bert_tokenizer import ErnieTinyTokenizer diff --git a/paddlehub/dataset/base_nlp_dataset.py b/paddlehub/dataset/base_nlp_dataset.py index 3f22cd9a2bb73a1dcc308d29c878e8304ff9abbe..ddb47c540492d12cbc3be0b698c3cccbfda12a49 100644 --- a/paddlehub/dataset/base_nlp_dataset.py +++ b/paddlehub/dataset/base_nlp_dataset.py @@ -19,7 +19,10 @@ from __future__ import print_function import io import csv +import collections +from tqdm import tqdm +import numpy as np from paddlehub.dataset import InputExample, BaseDataset from paddlehub.common.logger import logger @@ -36,7 +39,9 @@ class BaseNLPDataset(BaseDataset): train_file_with_header=False, dev_file_with_header=False, test_file_with_header=False, - predict_file_with_header=False): + predict_file_with_header=False, + tokenizer=None, + max_seq_len=128): super(BaseNLPDataset, self).__init__( base_path=base_path, train_file=train_file, @@ -49,6 +54,52 @@ class BaseNLPDataset(BaseDataset): dev_file_with_header=dev_file_with_header, test_file_with_header=test_file_with_header, predict_file_with_header=predict_file_with_header) + self.tokenizer = tokenizer + self.max_seq_len = max_seq_len + self._train_records = None + self._dev_records = None + self._test_records = None + self._predict_records = None + + @property + def train_records(self): + if not self._train_records: + examples = self.train_examples + if not self.tokenizer or not examples: + return [] + logger.info("Processing the train set...") + self._train_records = self._convert_examples_to_records(examples) + return self._train_records + + @property + def dev_records(self): + if not self._dev_records: + examples = self.dev_examples + if not self.tokenizer or not examples: + return [] + logger.info("Processing the dev set...") + self._dev_records = self._convert_examples_to_records(examples) + return self._dev_records + + @property + def test_records(self): + if not self._test_records: 
+ examples = self.test_examples + if not self.tokenizer or not examples: + return [] + logger.info("Processing the test set...") + self._test_records = self._convert_examples_to_records(examples) + return self._test_records + + @property + def predict_records(self): + if not self._predict_records: + examples = self.predict_examples + if not self.tokenizer or not examples: + return [] + logger.info("Processing the predict set...") + self._predict_records = self._convert_examples_to_records(examples) + return self._predict_records def _read_file(self, input_file, phase=None): """Reads a tab separated value file.""" @@ -96,3 +147,708 @@ class BaseNLPDataset(BaseDataset): % (input_file)) examples.append(example) return examples + + def _convert_examples_to_records(self, examples): + """ + Returns a list[dict] including all the input information what the model need. + + Args: + examples (list): the data example, returned by _read_file. + + Returns: + a list with all the examples record. + """ + + records = [] + for example in examples: + record = self.tokenizer.encode( + text=example.text_a, + text_pair=example.text_b, + max_seq_len=self.max_seq_len) + if example.label: + record["label"] = self.label_list.index( + example.label) if self.label_list else float(example.label) + records.append(record) + return records + + def get_train_records(self, shuffle=False): + return self.get_records("train", shuffle=shuffle) + + def get_dev_records(self, shuffle=False): + return self.get_records("dev", shuffle=shuffle) + + def get_test_records(self, shuffle=False): + return self.get_records("test", shuffle=shuffle) + + def get_val_records(self, shuffle=False): + return self.get_records("val", shuffle=shuffle) + + def get_predict_records(self, shuffle=False): + return self.get_records("predict", shuffle=shuffle) + + def get_records(self, phase, shuffle=False): + if phase == "train": + records = self.train_records + elif phase == "dev": + records = self.dev_records + elif phase == "test": + records = self.test_records + elif phase == "val": + records = self.dev_records + elif phase == "predict": + records = self.predict_records + else: + raise ValueError("Invalid phase: %s" % phase) + + if shuffle: + np.random.shuffle(records) + return records + + def get_feed_list(self, phase): + records = self.get_records(phase) + if records: + feed_list = list(records[0].keys()) + else: + if phase == "predict": + feed_list = [ + feed_name for feed_name in self.get_feed_list("train") + if feed_name != "label" + ] + else: + feed_list = [ + feed_name for feed_name in self.get_feed_list("train") + ] + return feed_list + + def batch_records_generator(self, + phase, + batch_size, + shuffle=True, + pad_to_batch_max_seq_len=False): + """ generate a batch of records, usually used in dynamic graph mode. + + Args: + phase (str): the dataset phase, can be "train", "dev", "val", "test" or "predict". + batch_size (int): the data batch size + shuffle (bool): if set to True, will shuffle the dataset. + pad_to_batch_max_seq_len (bool): if set to True, will dynamically pad to the max sequence length of the batch data. + Only recommended to set to True when the model has used RNN. 
+ """ + records = self.get_records(phase, shuffle=shuffle) + + batch_records = [] + batch_lens = [] + for record in records: + batch_records.append(record) + if pad_to_batch_max_seq_len: + # This may reduce the processing speed + tokens_wo_pad = [ + token for token in self.tokenizer.decode( + record, only_convert_to_tokens=True) + if token != self.tokenizer.pad_token + ] + batch_lens.append(len(tokens_wo_pad)) + if len(batch_records) == batch_size: + if pad_to_batch_max_seq_len: + # This may reduce the processing speed. + batch_max_seq_len = max(batch_lens) + for record in batch_records: + for key, value in record.items(): + if isinstance(value, list): + # This may not be universal + record[key] = value[:batch_max_seq_len] + rev_batch_records = { + key: [record[key] for record in batch_records] + for key in batch_records[0] + } + yield rev_batch_records + batch_records = [] + batch_lens = [] + + if batch_records: + if pad_to_batch_max_seq_len: + # This may reduce the processing speed. + batch_max_seq_len = max(batch_lens) + for record in batch_records: + for key in record.keys(): + if isinstance(record[key], list): + record[key] = record[key][:batch_max_seq_len] + rev_batch_records = { + key: [record[key] for record in batch_records] + for key in batch_records[0] + } + yield rev_batch_records + + +class TextClassificationDataset(BaseNLPDataset): + def _convert_examples_to_records(self, examples): + """ + Returns a list[dict] including all the input information what the model need. + + Args: + examples (list): the data example, returned by _read_file. + + Returns: + a list with all the examples record. + """ + + records = [] + for example in examples: + record = self.tokenizer.encode( + text=example.text_a, + text_pair=example.text_b, + max_seq_len=self.max_seq_len) + if example.label: + record["label"] = self.label_list.index(example.label) + records.append(record) + return records + + +class RegressionDataset(BaseNLPDataset): + def _convert_examples_to_records(self, examples): + """ + Returns a list[dict] including all the input information what the model need. + + Args: + examples (list): the data example, returned by _read_file. + + Returns: + a list with all the examples record. 
+ """ + + records = [] + for example in examples: + record = self.tokenizer.encode( + text=example.text_a, + text_pair=example.text_b, + max_seq_len=self.max_seq_len) + if example.label: + record["label"] = float(example.label) + records.append(record) + return records + + +class SeqLabelingDataset(BaseNLPDataset): + def __init__(self, + base_path, + train_file=None, + dev_file=None, + test_file=None, + predict_file=None, + label_file=None, + label_list=None, + train_file_with_header=False, + dev_file_with_header=False, + test_file_with_header=False, + predict_file_with_header=False, + tokenizer=None, + max_seq_len=128, + split_char="\002", + no_entity_label="O"): + self.no_entity_label = no_entity_label + self.split_char = split_char + + super(SeqLabelingDataset, self).__init__( + base_path=base_path, + train_file=train_file, + dev_file=dev_file, + test_file=test_file, + predict_file=predict_file, + label_file=label_file, + label_list=label_list, + train_file_with_header=train_file_with_header, + dev_file_with_header=dev_file_with_header, + test_file_with_header=test_file_with_header, + predict_file_with_header=predict_file_with_header, + tokenizer=tokenizer, + max_seq_len=max_seq_len) + + def _convert_examples_to_records(self, examples): + """ + Returns a list[dict] including all the input information what the model need. + + Args: + examples (list): the data examples, returned by _read_file. + + Returns: + a list with all the examples record. + """ + records = [] + for example in examples: + tokens, labels = self._reseg_token_label( + tokens=example.text_a.split(self.split_char), + labels=example.label.split(self.split_char)) + record = self.tokenizer.encode( + text=tokens, max_seq_len=self.max_seq_len) + if labels: + record["label"] = [] + tokens_with_specical_token = self.tokenizer.decode( + record, only_convert_to_tokens=True) + tokens_index = 0 + for token in tokens_with_specical_token: + if tokens_index < len( + tokens) and token == tokens[tokens_index]: + record["label"].append( + self.label_list.index(labels[tokens_index])) + tokens_index += 1 + else: + record["label"].append( + self.label_list.index(self.no_entity_label)) + records.append(record) + return records + + def _reseg_token_label(self, tokens, labels=None): + if labels: + if len(tokens) != len(labels): + raise ValueError( + "The length of tokens must be same with labels") + ret_tokens = [] + ret_labels = [] + for token, label in zip(tokens, labels): + sub_token = self.tokenizer.tokenize(token) + if len(sub_token) == 0: + continue + ret_tokens.extend(sub_token) + ret_labels.append(label) + if len(sub_token) < 2: + continue + sub_label = label + if label.startswith("B-"): + sub_label = "I-" + label[2:] + ret_labels.extend([sub_label] * (len(sub_token) - 1)) + + if len(ret_tokens) != len(ret_labels): + raise ValueError( + "The length of ret_tokens can't match with labels") + return ret_tokens, ret_labels + else: + ret_tokens = [] + for token in tokens: + sub_token = self.tokenizer.tokenize(token) + if len(sub_token) == 0: + continue + ret_tokens.extend(sub_token) + if len(sub_token) < 2: + continue + + return ret_tokens, None + + +class MultiLabelDataset(BaseNLPDataset): + def _convert_examples_to_records(self, examples): + """ + Returns a list[dict] including all the input information what the model need. + + Args: + examples (list): the data examples, returned by _read_file. + max_seq_len (int): padding to the max sequence length. + + Returns: + a list with all the examples record. 
+ """ + records = [] + for example in examples: + record = self.tokenizer.encode( + text=example.text_a, + text_pair=example.text_b, + max_seq_len=self.max_seq_len) + if example.label: + record["label"] = [int(label) for label in example.label] + records.append(record) + return records + + +class MRCDataset(BaseNLPDataset): + def __init__( + self, + base_path, + train_file=None, + dev_file=None, + test_file=None, + predict_file=None, + label_file=None, + label_list=None, + train_file_with_header=False, + dev_file_with_header=False, + test_file_with_header=False, + predict_file_with_header=False, + tokenizer=None, + max_seq_len=128, + max_query_len=64, + doc_stride=128, + ): + + super(BaseNLPDataset, self).__init__( + base_path=base_path, + train_file=train_file, + dev_file=dev_file, + test_file=test_file, + predict_file=predict_file, + label_file=label_file, + label_list=label_list, + train_file_with_header=train_file_with_header, + dev_file_with_header=dev_file_with_header, + test_file_with_header=test_file_with_header, + predict_file_with_header=predict_file_with_header, + ) + + self.tokenizer = tokenizer + self.max_seq_len = max_seq_len + self.max_query_len = max_query_len + self._DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) + self.doc_stride = doc_stride + self._Feature = collections.namedtuple("Feature", [ + "unique_id", + "example_index", + "doc_span_index", + "tokens", + "token_to_orig_map", + "token_is_max_context", + ]) + self.special_tokens_num, self.special_tokens_num_before_doc = self._get_special_tokens_num( + ) + + self._train_records = None + self._dev_records = None + self._test_records = None + self._predict_records = None + self._train_features = None + self._dev_features = None + self._test_features = None + self._predict_features = None + + @property + def train_records(self): + if not self._train_records: + examples = self.train_examples + if not self.tokenizer or not examples: + return [] + logger.info("Processing the train set...") + self._train_records, self._train_features = self._convert_examples_to_records_and_features( + examples, "train") + return self._train_records + + @property + def dev_records(self): + if not self._dev_records: + examples = self.dev_examples + if not self.tokenizer or not examples: + return [] + logger.info("Processing the dev set...") + self._dev_records, self._dev_features = self._convert_examples_to_records_and_features( + examples, "dev") + return self._dev_records + + @property + def test_records(self): + if not self._test_records: + examples = self.test_examples + if not self.tokenizer or not examples: + return [] + logger.info("Processing the test set...") + self._test_records, self._test_features = self._convert_examples_to_records_and_features( + examples, "test") + return self._test_records + + @property + def predict_records(self): + if not self._predict_records: + examples = self.predict_examples + if not self.tokenizer or not examples: + return [] + logger.info("Processing the predict set...") + self._predict_records, self._predict_features = self._convert_examples_to_records_and_features( + examples, "predict") + return self._predict_records + + @property + def train_features(self): + if not self._train_features: + examples = self.train_examples + if not self.tokenizer or not examples: + return [] + logger.info("Processing the train set...") + self._train_records, self._train_features = self._convert_examples_to_records_and_features( + examples, "train") + return self._train_features + + @property + def 
dev_features(self): + if not self._dev_features: + examples = self.dev_examples + if not self.tokenizer or not examples: + return [] + logger.info("Processing the dev set...") + self._dev_records, self._dev_features = self._convert_examples_to_records_and_features( + examples, "dev") + return self._dev_features + + @property + def test_features(self): + if not self._test_features: + examples = self.test_examples + if not self.tokenizer or not examples: + return [] + logger.info("Processing the test set...") + self._test_records, self._test_features = self._convert_examples_to_records_and_features( + examples, "test") + return self._test_features + + @property + def predict_features(self): + if not self._predict_features: + examples = self.predict_examples + if not self.tokenizer or not examples: + return [] + logger.info("Processing the predict set...") + self._predict_records, self._predict_features = self._convert_examples_to_records_and_features( + examples, "predict") + return self._predict_features + + def _get_special_tokens_num(self): + if not self.tokenizer: + return None, None + # We must have a pad token, so we can use it to make fake text. + fake_question = [self.tokenizer.pad_token] + fake_answer = [self.tokenizer.pad_token] + special_tokens_num = 0 + special_tokens_num_before_doc = 0 + seen_pad_num = 0 + fake_record = self.tokenizer.encode(fake_question, fake_answer) + fake_tokens_with_special_tokens = self.tokenizer.decode( + fake_record, only_convert_to_tokens=True) + for token in fake_tokens_with_special_tokens: + if token == self.tokenizer.pad_token: + seen_pad_num += 1 + if seen_pad_num > 2: + # The third pad_token is added by padding + break + else: + special_tokens_num += 1 + if seen_pad_num < 2: + # The second pad_token is the fake_answer + special_tokens_num_before_doc += 1 + return special_tokens_num, special_tokens_num_before_doc + + def _convert_examples_to_records_and_features(self, examples, phase): + """Loads a data file into a list of `InputBatch`s.""" + features = [] + records = [] + unique_id = 1000000000 + + with tqdm(total=len(examples)) as process_bar: + for (example_index, example) in enumerate(examples): + # Tokenize question_text + query_tokens = self.tokenizer.tokenize(example.question_text) + if len(query_tokens) > self.max_query_len: + query_tokens = query_tokens[0:self.max_query_len] + + # Tokenize doc_tokens and get token-sub_token position map + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = self.tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + # Update the answer position to the new sub_token position + tok_start_position = None + tok_end_position = None + is_impossible = example.is_impossible if hasattr( + example, "is_impossible") else False + + if phase != "predict" and is_impossible: + tok_start_position = -1 + tok_end_position = -1 + if phase != "predict" and not is_impossible: + tok_start_position = orig_to_tok_index[ + example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[ + example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + (tok_start_position, + tok_end_position) = self.improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, + self.tokenizer, example.orig_answer_text) + + # We can have documents that are 
longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. + # if hasattr(self.tokenizer, "num_special_tokens_to_add"): + max_tokens_for_doc = self.max_seq_len - len( + query_tokens) - self.special_tokens_num + + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append( + self._DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, self.doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + # Update the start_position and end_position to doc_span + start_position = None + end_position = None + if phase != "predict": + if is_impossible: + start_position = 0 + end_position = 0 + else: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. + doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start + and tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + start_position = 0 + end_position = 0 + else: + doc_offset = len( + query_tokens + ) + self.special_tokens_num_before_doc + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + record = self.tokenizer.encode( + text=query_tokens, + text_pair=all_doc_tokens[doc_span.start:doc_span.start + + doc_span.length], + max_seq_len=self.max_seq_len) + record["start_position"] = start_position + record["end_position"] = end_position + record["unique_id"] = unique_id + records.append(record) + + # The other information is saved in feature, which is helpful in postprocessing. + # The bridge with record and feature is unique_id. + tokens = self.tokenizer.decode( + record, only_convert_to_tokens=True) + token_to_orig_map = {} + token_is_max_context = {} + doc_token_start = len( + query_tokens) + self.special_tokens_num_before_doc + for i in range(doc_span.length): + # split_token_index: the doc token position in doc after tokenize + # doc_token_index: the doc token position in record after encode + split_token_index = doc_span.start + i + doc_token_index = doc_token_start + i + token_to_orig_map[doc_token_index] = tok_to_orig_index[ + split_token_index] + is_max_context = self.check_is_max_context( + doc_spans, doc_span_index, split_token_index) + token_is_max_context[doc_token_index] = is_max_context + + feature = self._Feature( + unique_id=unique_id, + example_index=example_index, + doc_span_index=doc_span_index, + tokens=tokens, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + ) + features.append(feature) + + unique_id += 1 + process_bar.update(1) + + return records, features + + def improve_answer_span(self, doc_tokens, input_start, input_end, tokenizer, + orig_answer_text): + """Returns tokenized answer spans that better match the annotated answer.""" + + # The SQuAD annotations are character based. We first project them to + # whitespace-tokenized words. But then after WordPiece tokenization, we can + # often find a "better match". For example: + # + # Question: What year was John Smith born? + # Context: The leader was John Smith (1895-1943). 
+ # Answer: 1895 + # + # The original whitespace-tokenized answer will be "(1895-1943).". However + # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match + # the exact answer, 1895. + # + # However, this is not always possible. Consider the following: + # + # Question: What country is the top exporter of electronics? + # Context: The Japanese electronics industry is the largest in the world. + # Answer: Japan + # + # In this case, the annotator chose "Japan" as a character sub-span of + # the word "Japanese". Since our WordPiece tokenizer does not split + # "Japanese", we just use "Japanese" as the annotation. This is fairly rare + # in SQuAD, but does happen. + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + def check_is_max_context(self, doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context.
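
To make the "maximum context" rule described above concrete, here is a small standalone sketch (illustrative only, not part of this patch) that applies the same score used by check_is_max_context, min(left context, right context) + 0.01 * span length, to the 'bought' example from the comment. The DocSpan namedtuple below simply mirrors the _DocSpan defined in MRCDataset.__init__:

    import collections

    DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
    doc = "the man went to the store and bought a gallon of milk".split()
    spans = {"A": DocSpan(0, 5), "B": DocSpan(3, 5), "C": DocSpan(6, 5)}
    position = doc.index("bought")  # token index 7

    for name, span in spans.items():
        end = span.start + span.length - 1
        if not (span.start <= position <= end):
            print("span %s does not contain 'bought'" % name)
            continue
        left, right = position - span.start, end - position
        score = min(left, right) + 0.01 * span.length
        print("span %s: left=%d right=%d score=%.2f" % (name, left, right, score))
    # span B scores 0.05 (left=4, right=0) and span C scores 1.05 (left=1, right=3),
    # so only span C is treated as the max-context span for 'bought'.
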
+ best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, + num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + def get_features(self, phase): + if phase == "train": + return self.train_features + elif phase == "dev": + return self.dev_features + elif phase == "test": + return self.test_features + elif phase == "val": + return self.dev_features + elif phase == "predict": + return self.predict_features + else: + raise ValueError("Invalid phase: %s" % phase) diff --git a/paddlehub/dataset/bq.py b/paddlehub/dataset/bq.py index 25a85d126af6b28f986972babee096f9cc522315..eb52ddeaa6604290de9e0602c6ca1782b6d2439a 100644 --- a/paddlehub/dataset/bq.py +++ b/paddlehub/dataset/bq.py @@ -20,11 +20,16 @@ from __future__ import print_function import os from paddlehub.common.dir import DATA_HOME -from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset +from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset -class BQ(BaseNLPDataset): - def __init__(self): +class BQ(TextClassificationDataset): + """ + The Bank Question (BQ) corpus, a Chinese corpus for sentence semantic equivalence identification (SSEI), + contains 120,000 question pairs from 1-year online bank custom service logs. + """ + + def __init__(self, tokenizer=None, max_seq_len=None): dataset_dir = os.path.join(DATA_HOME, "bq") base_path = self._download_dataset( dataset_dir, @@ -36,18 +41,16 @@ class BQ(BaseNLPDataset): test_file="test.txt", label_file=None, label_list=["0", "1"], - ) + tokenizer=tokenizer, + max_seq_len=max_seq_len) if __name__ == "__main__": - ds = BQ() - print("first 10 dev") + from paddlehub.tokenizer.bert_tokenizer import BertTokenizer + ds = BQ(tokenizer=BertTokenizer(vocab_file='vocab.txt'), max_seq_len=10) + print("first 10 dev examples") for e in ds.get_dev_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) - print("first 10 train") - for e in ds.get_train_examples()[:10]: - print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) - print("first 10 test") - for e in ds.get_test_examples()[:10]: - print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) - print(ds) + print("first 10 dev records") + for e in ds.get_dev_records()[:10]: + print(e) diff --git a/paddlehub/dataset/chnsenticorp.py b/paddlehub/dataset/chnsenticorp.py index 33864c1f8ec68c6331917dd2d3e6669304d4fcd2..58224012622a8cd706264656d25bd8d9b7fd5b28 100644 --- a/paddlehub/dataset/chnsenticorp.py +++ b/paddlehub/dataset/chnsenticorp.py @@ -23,16 +23,16 @@ import csv from paddlehub.dataset import InputExample from paddlehub.common.dir import DATA_HOME -from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset +from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset -class ChnSentiCorp(BaseNLPDataset): +class ChnSentiCorp(TextClassificationDataset): """ ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for opinion mining) """ - def __init__(self): + def __init__(self, tokenizer=None, max_seq_len=None): dataset_dir = os.path.join(DATA_HOME, "chnsenticorp") base_path = self._download_dataset( dataset_dir, @@ -44,7 +44,8 @@ class 
ChnSentiCorp(BaseNLPDataset): test_file="test.tsv", label_file=None, label_list=["0", "1"], - ) + tokenizer=tokenizer, + max_seq_len=max_seq_len) def _read_file(self, input_file, phase=None): """Reads a tab separated value file.""" @@ -63,6 +64,13 @@ class ChnSentiCorp(BaseNLPDataset): if __name__ == "__main__": - ds = ChnSentiCorp() - for e in ds.get_train_examples()[:10]: + from paddlehub.tokenizer.bert_tokenizer import BertTokenizer + tokenizer = BertTokenizer(vocab_file='vocab.txt') + ds = ChnSentiCorp(tokenizer=tokenizer, max_seq_len=10) + + print("first 10 dev examples") + for e in ds.get_dev_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) + print("first 10 dev records") + for e in ds.get_dev_records()[:10]: + print(e) diff --git a/paddlehub/dataset/cmrc2018.py b/paddlehub/dataset/cmrc2018.py index 28f5113da75e6659084a416cb496c66fcbc02277..88e080d9494bbde2ff9c3c42143411c737dcc1aa 100644 --- a/paddlehub/dataset/cmrc2018.py +++ b/paddlehub/dataset/cmrc2018.py @@ -20,7 +20,7 @@ import os from paddlehub.reader import tokenization from paddlehub.common.dir import DATA_HOME from paddlehub.common.logger import logger -from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset +from paddlehub.dataset.base_nlp_dataset import MRCDataset _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/cmrc2018.tar.gz" SPIECE_UNDERLINE = '▁' @@ -62,10 +62,14 @@ class CMRC2018Example(object): return s -class CMRC2018(BaseNLPDataset): +class CMRC2018(MRCDataset): """A single set of features of data.""" - def __init__(self): + def __init__(self, + tokenizer=None, + max_seq_len=None, + max_query_len=64, + doc_stride=128): dataset_dir = os.path.join(DATA_HOME, "cmrc2018") base_path = self._download_dataset(dataset_dir, url=_DATA_URL) super(CMRC2018, self).__init__( @@ -75,6 +79,10 @@ class CMRC2018(BaseNLPDataset): test_file=None, label_file=None, label_list=None, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + max_query_len=max_query_len, + doc_stride=doc_stride, ) def _read_file(self, input_file, phase=False): @@ -201,7 +209,9 @@ class CMRC2018(BaseNLPDataset): if __name__ == "__main__": print("begin") - ds = CMRC2018() + from paddlehub.tokenizer.bert_tokenizer import BertTokenizer + tokenizer = BertTokenizer(vocab_file='vocab.txt') + ds = CMRC2018(tokenizer=tokenizer, max_seq_len=50) print("train") examples = ds.get_train_examples() for index, e in enumerate(examples): diff --git a/paddlehub/dataset/dataset.py b/paddlehub/dataset/dataset.py index 2be502807c489026c05c53ad509c73ea18774363..ac2fda31cf429b7bc6e7aa049020d59380912cb7 100644 --- a/paddlehub/dataset/dataset.py +++ b/paddlehub/dataset/dataset.py @@ -121,6 +121,20 @@ class BaseDataset(object): def get_predict_examples(self): return self.predict_examples + def get_examples(self, phase): + if phase == "train": + return self.get_train_examples() + elif phase == "dev": + return self.get_dev_examples() + elif phase == "test": + return self.get_test_examples() + elif phase == "val": + return self.get_val_examples() + elif phase == "predict": + return self.get_predict_examples() + else: + raise ValueError("Invalid phase: %s" % phase) + def get_labels(self): return self.label_list diff --git a/paddlehub/dataset/drcd.py b/paddlehub/dataset/drcd.py index 1d8593c528df38f7567fdd3b2d24c5323600d9a3..da2a2fe21877b33d2bf720501930a03bbff69299 100644 --- a/paddlehub/dataset/drcd.py +++ b/paddlehub/dataset/drcd.py @@ -20,7 +20,7 @@ import os from paddlehub.reader import tokenization from paddlehub.common.dir import DATA_HOME 
from paddlehub.common.logger import logger -from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset +from paddlehub.dataset.base_nlp_dataset import MRCDataset _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/drcd.tar.gz" SPIECE_UNDERLINE = '▁' @@ -62,10 +62,16 @@ class DRCDExample(object): return s -class DRCD(BaseNLPDataset): +class DRCD(MRCDataset): """A single set of features of data.""" - def __init__(self): + def __init__( + self, + tokenizer=None, + max_seq_len=None, + max_query_len=64, + doc_stride=128, + ): dataset_dir = os.path.join(DATA_HOME, "drcd") base_path = self._download_dataset(dataset_dir, url=_DATA_URL) super(DRCD, self).__init__( @@ -75,6 +81,10 @@ class DRCD(BaseNLPDataset): test_file="DRCD_test.json", label_file=None, label_list=None, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + max_query_len=max_query_len, + doc_stride=doc_stride, ) def _read_file(self, input_file, phase=None): @@ -176,8 +186,8 @@ class DRCD(BaseNLPDataset): cleaned_answer_text = "".join( tokenization.whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: - logger.warning((actual_text, " vs ", - cleaned_answer_text, " in ", qa)) + logger.warning("Could not find answer: '%s' vs. '%s'" % + (actual_text, cleaned_answer_text)) continue example = DRCDExample( qas_id=qas_id, @@ -191,7 +201,9 @@ class DRCD(BaseNLPDataset): if __name__ == "__main__": - ds = DRCD() + from paddlehub.tokenizer.bert_tokenizer import BertTokenizer + tokenizer = BertTokenizer(vocab_file='vocab.txt') + ds = DRCD(tokenizer=tokenizer, max_seq_len=50) print("train") examples = ds.get_train_examples() for index, e in enumerate(examples): diff --git a/paddlehub/dataset/glue.py b/paddlehub/dataset/glue.py index 9a92076e3da0d7e9ef622fa12dc7d6722c75f721..034d824caac08e0878c35b247a90503c88fe1b0d 100644 --- a/paddlehub/dataset/glue.py +++ b/paddlehub/dataset/glue.py @@ -36,7 +36,7 @@ class GLUE(BaseNLPDataset): for more information """ - def __init__(self, sub_dataset='SST-2'): + def __init__(self, sub_dataset='SST-2', tokenizer=None, max_seq_len=None): # sub_dataset : CoLA, MNLI, MRPC, QNLI, QQP, RTE, SST-2, STS-B if sub_dataset not in [ 'CoLA', 'MNLI', 'MNLI_m', 'MNLI_mm', 'MRPC', 'QNLI', 'QQP', @@ -85,7 +85,8 @@ class GLUE(BaseNLPDataset): predict_file=predict_file, label_file=None, label_list=label_list, - ) + tokenizer=tokenizer, + max_seq_len=max_seq_len) def _read_file(self, input_file, phase=None): """Reads a tab separated value file.""" @@ -165,11 +166,13 @@ class GLUE(BaseNLPDataset): if __name__ == "__main__": + from paddlehub.tokenizer.bert_tokenizer import BertTokenizer + tokenizer = BertTokenizer(vocab_file='vocab.txt') for sub_dataset in [ 'CoLA', 'MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'SST-2', 'STS-B' ]: print(sub_dataset) - ds = GLUE(sub_dataset=sub_dataset) + ds = GLUE(sub_dataset=sub_dataset, tokenizer=tokenizer, max_seq_len=10) for e in ds.get_train_examples()[:2]: print(e) print() @@ -182,3 +185,6 @@ if __name__ == "__main__": for e in ds.get_predict_examples()[:2]: print(e) print() + print("first 10 dev records") + for e in ds.get_dev_records()[:10]: + print(e) diff --git a/paddlehub/dataset/iflytek.py b/paddlehub/dataset/iflytek.py index db34471f95cf133d0b54182c65d019f5cf54fcc2..f6437d5f485cd94cbc1b4c3e21338192b693b643 100644 --- a/paddlehub/dataset/iflytek.py +++ b/paddlehub/dataset/iflytek.py @@ -22,13 +22,13 @@ import os from paddlehub.dataset import InputExample from paddlehub.common.dir import DATA_HOME -from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset 
+from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/iflytek.tar.gz" -class IFLYTEK(BaseNLPDataset): - def __init__(self): +class IFLYTEK(TextClassificationDataset): + def __init__(self, tokenizer=None, max_seq_len=None): dataset_dir = os.path.join(DATA_HOME, "iflytek") base_path = self._download_dataset(dataset_dir, url=_DATA_URL) super(IFLYTEK, self).__init__( @@ -38,7 +38,8 @@ class IFLYTEK(BaseNLPDataset): test_file="test.txt", label_file=None, label_list=[str(i) for i in range(119)], - ) + tokenizer=tokenizer, + max_seq_len=max_seq_len) def _read_file(self, input_file, phase=None): """Reads a tab separated value file.""" @@ -56,7 +57,9 @@ class IFLYTEK(BaseNLPDataset): if __name__ == "__main__": - ds = IFLYTEK() + from paddlehub.tokenizer.bert_tokenizer import BertTokenizer + tokenizer = BertTokenizer(vocab_file='vocab.txt') + ds = IFLYTEK(tokenizer=tokenizer, max_seq_len=10) print("first 10 dev") for e in ds.get_dev_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) @@ -67,3 +70,6 @@ if __name__ == "__main__": for e in ds.get_test_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print(ds) + print("first 10 dev records") + for e in ds.get_dev_records()[:10]: + print(e) diff --git a/paddlehub/dataset/inews.py b/paddlehub/dataset/inews.py index 82605ed89fa17d1d3ac2bf22d74f5794cf309d60..11988add68c81a07277bc25fd67ddde8b9631f3e 100644 --- a/paddlehub/dataset/inews.py +++ b/paddlehub/dataset/inews.py @@ -23,17 +23,17 @@ import csv from paddlehub.dataset import InputExample from paddlehub.common.dir import DATA_HOME -from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset +from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/inews.tar.gz" -class INews(BaseNLPDataset): +class INews(TextClassificationDataset): """ INews is a sentiment analysis dataset for Internet News """ - def __init__(self): + def __init__(self, tokenizer=None, max_seq_len=None): dataset_dir = os.path.join(DATA_HOME, "inews") base_path = self._download_dataset(dataset_dir, url=_DATA_URL) super(INews, self).__init__( @@ -43,7 +43,8 @@ class INews(BaseNLPDataset): test_file="test.txt", label_file=None, label_list=["0", "1", "2"], - ) + tokenizer=tokenizer, + max_seq_len=max_seq_len) def _read_file(self, input_file, phase=None): """Reads a tab separated value file.""" @@ -60,7 +61,10 @@ class INews(BaseNLPDataset): if __name__ == "__main__": - ds = INews() + from paddlehub.tokenizer.bert_tokenizer import BertTokenizer + + tokenizer = BertTokenizer(vocab_file='vocab.txt') + ds = INews(tokenizer=tokenizer, max_seq_len=10) print("first 10 dev") for e in ds.get_dev_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) @@ -71,3 +75,6 @@ if __name__ == "__main__": for e in ds.get_test_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print(ds) + print("first 10 dev records") + for e in ds.get_dev_records()[:10]: + print(e) diff --git a/paddlehub/dataset/lcqmc.py b/paddlehub/dataset/lcqmc.py index 990e4c799e6763c673cad59afd347ee653197e22..b815ceae0bffd084d51e4aa8eb1d8ce82db8d2d1 100644 --- a/paddlehub/dataset/lcqmc.py +++ b/paddlehub/dataset/lcqmc.py @@ -23,13 +23,13 @@ import csv from paddlehub.dataset import InputExample from paddlehub.common.dir import DATA_HOME -from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset +from 
paddlehub.dataset.base_nlp_dataset import TextClassificationDataset _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/lcqmc.tar.gz" -class LCQMC(BaseNLPDataset): - def __init__(self): +class LCQMC(TextClassificationDataset): + def __init__(self, tokenizer=None, max_seq_len=None): dataset_dir = os.path.join(DATA_HOME, "lcqmc") base_path = self._download_dataset(dataset_dir, url=_DATA_URL) super(LCQMC, self).__init__( @@ -39,7 +39,8 @@ class LCQMC(BaseNLPDataset): test_file="test.tsv", label_file=None, label_list=["0", "1"], - ) + tokenizer=tokenizer, + max_seq_len=max_seq_len) def _read_file(self, input_file, phase=None): """Reads a tab separated value file.""" @@ -58,7 +59,10 @@ class LCQMC(BaseNLPDataset): if __name__ == "__main__": - ds = LCQMC() + from paddlehub.tokenizer.bert_tokenizer import BertTokenizer + tokenizer = BertTokenizer(vocab_file='vocab.txt') + ds = LCQMC(tokenizer=tokenizer, max_seq_len=512) + print("first 10 dev") for e in ds.get_dev_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) @@ -69,3 +73,7 @@ if __name__ == "__main__": for e in ds.get_test_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print(ds) + + print("first 10 dev records") + for e in ds.get_dev_records()[:10]: + print(e) diff --git a/paddlehub/dataset/msra_ner.py b/paddlehub/dataset/msra_ner.py index 00b4974cf360ecbbf818e475667ea2644f27cb19..bec8772649de4f1e168b23c8b2a3ab95e4e31359 100644 --- a/paddlehub/dataset/msra_ner.py +++ b/paddlehub/dataset/msra_ner.py @@ -23,12 +23,12 @@ import csv from paddlehub.dataset import InputExample from paddlehub.common.dir import DATA_HOME -from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset +from paddlehub.dataset.base_nlp_dataset import SeqLabelingDataset _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/msra_ner.tar.gz" -class MSRA_NER(BaseNLPDataset): +class MSRA_NER(SeqLabelingDataset): """ A set of manually annotated Chinese word-segmentation data and specifications for training and testing a Chinese word-segmentation system @@ -36,7 +36,7 @@ class MSRA_NER(BaseNLPDataset): https://www.microsoft.com/en-us/download/details.aspx?id=52531 """ - def __init__(self): + def __init__(self, tokenizer=None, max_seq_len=None): dataset_dir = os.path.join(DATA_HOME, "msra_ner") base_path = self._download_dataset(dataset_dir, url=_DATA_URL) super(MSRA_NER, self).__init__( @@ -48,7 +48,8 @@ class MSRA_NER(BaseNLPDataset): label_list=[ "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O" ], - ) + tokenizer=tokenizer, + max_seq_len=max_seq_len) def _read_file(self, input_file, phase=None): """Reads a tab separated value file.""" @@ -67,7 +68,9 @@ class MSRA_NER(BaseNLPDataset): if __name__ == "__main__": - ds = MSRA_NER() + from paddlehub.tokenizer.bert_tokenizer import BertTokenizer + tokenizer = BertTokenizer(vocab_file='vocab.txt') + ds = MSRA_NER(tokenizer=tokenizer, max_seq_len=30) print("first 10 dev") for e in ds.get_dev_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) @@ -78,3 +81,6 @@ if __name__ == "__main__": for e in ds.get_test_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print(ds) + print("first 10 dev records") + for e in ds.get_dev_records()[:10]: + print(e) diff --git a/paddlehub/dataset/nlpcc_dbqa.py b/paddlehub/dataset/nlpcc_dbqa.py index a5ae3f2d7e2322bad5b209c562a0505f981d34c5..9794c4e21301e86b5c5412717d0a4f344c6bf731 100644 --- a/paddlehub/dataset/nlpcc_dbqa.py +++ 
b/paddlehub/dataset/nlpcc_dbqa.py @@ -23,19 +23,19 @@ import csv from paddlehub.dataset import InputExample from paddlehub.common.dir import DATA_HOME -from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset +from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/nlpcc-dbqa.tar.gz" -class NLPCC_DBQA(BaseNLPDataset): +class NLPCC_DBQA(TextClassificationDataset): """ Please refer to http://tcci.ccf.org.cn/conference/2017/dldoc/taskgline05.pdf for more information """ - def __init__(self): + def __init__(self, tokenizer=None, max_seq_len=None): dataset_dir = os.path.join(DATA_HOME, "nlpcc-dbqa") base_path = self._download_dataset(dataset_dir, url=_DATA_URL) super(NLPCC_DBQA, self).__init__( @@ -45,7 +45,8 @@ class NLPCC_DBQA(BaseNLPDataset): test_file="test.tsv", label_file=None, label_list=["0", "1"], - ) + tokenizer=tokenizer, + max_seq_len=max_seq_len) def _read_file(self, input_file, phase=None): """Reads a tab separated value file.""" @@ -64,7 +65,9 @@ class NLPCC_DBQA(BaseNLPDataset): if __name__ == "__main__": - ds = NLPCC_DBQA() + from paddlehub.tokenizer.bert_tokenizer import BertTokenizer + tokenizer = BertTokenizer(vocab_file='vocab.txt') + ds = NLPCC_DBQA(tokenizer=tokenizer, max_seq_len=10) print("first 10 dev") for e in ds.get_dev_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) @@ -75,3 +78,6 @@ if __name__ == "__main__": for e in ds.get_test_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print(ds) + print("first 10 dev records") + for e in ds.get_dev_records()[:10]: + print(e) diff --git a/paddlehub/dataset/squad.py b/paddlehub/dataset/squad.py index 2d68567f214238922031aa4704ef792eab3cd950..507b87f7741ca6e21c3e4160361cfe0342e7146f 100644 --- a/paddlehub/dataset/squad.py +++ b/paddlehub/dataset/squad.py @@ -20,7 +20,7 @@ import os from paddlehub.reader import tokenization from paddlehub.common.dir import DATA_HOME from paddlehub.common.logger import logger -from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset +from paddlehub.dataset.base_nlp_dataset import MRCDataset _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz" @@ -65,10 +65,17 @@ class SquadExample(object): return s -class SQUAD(BaseNLPDataset): +class SQUAD(MRCDataset): """A single set of features of data.""" - def __init__(self, version_2_with_negative=False): + def __init__( + self, + version_2_with_negative=False, + tokenizer=None, + max_seq_len=None, + max_query_len=64, + doc_stride=128, + ): self.version_2_with_negative = version_2_with_negative if not version_2_with_negative: train_file = "train-v1.1.json" @@ -87,6 +94,10 @@ class SQUAD(BaseNLPDataset): test_file=None, label_file=None, label_list=None, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + max_query_len=max_query_len, + doc_stride=doc_stride, ) def _read_file(self, input_file, phase=None): @@ -177,7 +188,10 @@ class SQUAD(BaseNLPDataset): if __name__ == "__main__": - ds = SQUAD(version_2_with_negative=True) + from paddlehub.tokenizer.bert_tokenizer import BertTokenizer + tokenizer = BertTokenizer(vocab_file='vocab.txt') + ds = SQUAD( + version_2_with_negative=True, tokenizer=tokenizer, max_seq_len=512) print("first 10 dev") for e in ds.get_dev_examples()[:2]: print(e) diff --git a/paddlehub/dataset/thucnews.py b/paddlehub/dataset/thucnews.py index 68d1665c77869025f9490268b648ded7b76e68ce..14c1917f62f4768f86a465406d0b848776883b15 100644 --- a/paddlehub/dataset/thucnews.py +++ 
b/paddlehub/dataset/thucnews.py @@ -22,13 +22,13 @@ import os from paddlehub.dataset import InputExample from paddlehub.common.dir import DATA_HOME -from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset +from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/thucnews.tar.gz" -class THUCNEWS(BaseNLPDataset): - def __init__(self): +class THUCNEWS(TextClassificationDataset): + def __init__(self, tokenizer=None, max_seq_len=None): dataset_dir = os.path.join(DATA_HOME, "thucnews") base_path = self._download_dataset(dataset_dir, url=_DATA_URL) super(THUCNEWS, self).__init__( @@ -38,7 +38,8 @@ class THUCNEWS(BaseNLPDataset): test_file="test.txt", label_file=None, label_list=[str(i) for i in range(14)], - ) + tokenizer=tokenizer, + max_seq_len=max_seq_len) def _read_file(self, input_file, phase=None): """Reads a tab separated value file.""" @@ -56,7 +57,9 @@ class THUCNEWS(BaseNLPDataset): if __name__ == "__main__": - ds = THUCNEWS() + from paddlehub.tokenizer.bert_tokenizer import BertTokenizer + tokenizer = BertTokenizer(vocab_file='vocab.txt') + ds = THUCNEWS(tokenizer=tokenizer, max_seq_len=10) print("first 10 dev") for e in ds.get_dev_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) @@ -67,3 +70,6 @@ if __name__ == "__main__": for e in ds.get_test_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print(ds) + print("first 10 dev records") + for e in ds.get_dev_records()[:10]: + print(e) diff --git a/paddlehub/dataset/tnews.py b/paddlehub/dataset/tnews.py index 72cab48a970ad3a93b8fc618639e30a88a85fddd..b37cc17ed5639ea9d81868bd184bdd90eef6168d 100644 --- a/paddlehub/dataset/tnews.py +++ b/paddlehub/dataset/tnews.py @@ -20,7 +20,8 @@ from __future__ import print_function import io import os -from paddlehub.dataset import InputExample, BaseDataset +from paddlehub.dataset import InputExample +from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset from paddlehub.common.dir import DATA_HOME _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/tnews.tar.gz" @@ -44,12 +45,12 @@ LABEL_NAME = { } -class TNews(BaseDataset): +class TNews(TextClassificationDataset): """ TNews is the chinese news classification dataset on Jinri Toutiao App. 
""" - def __init__(self): + def __init__(self, tokenizer=None, max_seq_len=None): dataset_dir = os.path.join(DATA_HOME, "tnews") base_path = self._download_dataset(dataset_dir, url=_DATA_URL) label_list = [ @@ -63,7 +64,8 @@ class TNews(BaseDataset): test_file="toutiao_category_test.txt", label_file=None, label_list=label_list, - ) + tokenizer=tokenizer, + max_seq_len=max_seq_len) def get_label_name(self, id): return LABEL_NAME[id] @@ -82,7 +84,9 @@ class TNews(BaseDataset): if __name__ == "__main__": - ds = TNews() + from paddlehub.tokenizer.bert_tokenizer import BertTokenizer + tokenizer = BertTokenizer(vocab_file='vocab.txt') + ds = TNews(tokenizer=tokenizer, max_seq_len=10) print("first 10 dev") for e in ds.get_dev_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) @@ -93,3 +97,6 @@ if __name__ == "__main__": for e in ds.get_test_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print(ds) + print("first 10 dev records") + for e in ds.get_dev_records()[:10]: + print(e) diff --git a/paddlehub/dataset/toxic.py b/paddlehub/dataset/toxic.py index 6577380247cc1a68676c68af2ffcf2c6053bcdea..dca4bf288ae285d1d40e361a40027e1c1b18403d 100644 --- a/paddlehub/dataset/toxic.py +++ b/paddlehub/dataset/toxic.py @@ -22,18 +22,18 @@ import pandas as pd from paddlehub.dataset import InputExample from paddlehub.common.dir import DATA_HOME -from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset +from paddlehub.dataset.base_nlp_dataset import MultiLabelDataset _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/toxic.tar.gz" -class Toxic(BaseNLPDataset): +class Toxic(MultiLabelDataset): """ The kaggle Toxic dataset: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge """ - def __init__(self): + def __init__(self, tokenizer=None, max_seq_len=None): dataset_dir = os.path.join(DATA_HOME, "toxic") base_path = self._download_dataset(dataset_dir, url=_DATA_URL) label_list = [ @@ -47,7 +47,8 @@ class Toxic(BaseNLPDataset): test_file="test.csv", label_file=None, label_list=label_list, - ) + tokenizer=tokenizer, + max_seq_len=max_seq_len) def _read_file(self, input_file, phase=None): """Reads a tab separated value file.""" @@ -64,7 +65,10 @@ class Toxic(BaseNLPDataset): if __name__ == "__main__": - ds = Toxic() + from paddlehub.tokenizer.bert_tokenizer import BertTokenizer + + tokenizer = BertTokenizer(vocab_file='vocab.txt') + ds = Toxic(tokenizer=tokenizer, max_seq_len=10) print("first 10 dev") for e in ds.get_dev_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) @@ -75,3 +79,6 @@ if __name__ == "__main__": for e in ds.get_test_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print(ds) + print("first 10 dev records") + for e in ds.get_dev_records()[:10]: + print(e) diff --git a/paddlehub/dataset/xnli.py b/paddlehub/dataset/xnli.py index 1f377e9d35d787e9ebc254a895e6536c52b4fd23..3e994f3641421d2bf2a486df5f69667d9d26a6d1 100644 --- a/paddlehub/dataset/xnli.py +++ b/paddlehub/dataset/xnli.py @@ -25,19 +25,19 @@ import csv from paddlehub.dataset import InputExample from paddlehub.common.dir import DATA_HOME -from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset +from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/XNLI-lan.tar.gz" -class XNLI(BaseNLPDataset): +class XNLI(TextClassificationDataset): """ Please refer to https://arxiv.org/pdf/1809.05053.pdf for more information 
""" - def __init__(self, language='zh'): + def __init__(self, language='zh', tokenizer=None, max_seq_len=None): if language not in [ "ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw", "th", "tr", "ur", "vi", "zh" @@ -55,6 +55,8 @@ class XNLI(BaseNLPDataset): test_file="%s_test.tsv" % language, label_file=None, label_list=["neutral", "contradiction", "entailment"], + tokenizer=tokenizer, + max_seq_len=max_seq_len, ) def _read_file(self, input_file, phase=None): @@ -74,7 +76,10 @@ class XNLI(BaseNLPDataset): if __name__ == "__main__": - ds = XNLI() + from paddlehub.tokenizer.bert_tokenizer import BertTokenizer + tokenizer = BertTokenizer(vocab_file='vocab.txt') + + ds = XNLI(tokenizer=tokenizer, max_seq_len=20) print("first 10 dev") for e in ds.get_dev_examples()[:10]: print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) diff --git a/paddlehub/finetune/strategy.py b/paddlehub/finetune/strategy.py index 080017d23ed7adf18b9fee278cee9eefb7ba331c..7f522c22c0213085d1fd75f6db70299a6f1ac46b 100644 --- a/paddlehub/finetune/strategy.py +++ b/paddlehub/finetune/strategy.py @@ -167,7 +167,7 @@ class DefaultStrategy(object): self.optimizer = fluid.optimizer.Adam( learning_rate=self.learning_rate, **kwargs) - def execute(self, loss, data_reader, config, dev_count): + def execute(self, loss, max_train_steps): if self.optimizer is not None: self.optimizer.minimize(loss) else: @@ -456,26 +456,9 @@ class CombinedStrategy(DefaultStrategy): "weight_decay"] * scheduled_lr fluid.layers.assign(output=param, input=updated_param) - def execute(self, loss, data_reader, config, dev_count): + def execute(self, loss, max_train_steps): # base information self.main_program = loss.block.program - self.config = config - - # self.num_examples = {'train': -1, 'dev': -1, 'test': -1} before data_generator - data_reader.data_generator( - batch_size=config.batch_size, phase='train', shuffle=True) - num_train_examples = data_reader.num_examples['train'] - - max_train_steps = config.num_epoch * num_train_examples // config.batch_size // dev_count - - try: - # nlp_reader - _in_tokens = data_reader.in_tokens - if _in_tokens: - max_train_steps *= data_reader.max_seq_len - except: - # cv_reader without .in_tokens and .max_seq_len - pass if self.scheduler["discriminative"]["blocks"] > 0 or self.scheduler[ "gradual_unfreeze"]["blocks"] > 0: @@ -494,8 +477,7 @@ class CombinedStrategy(DefaultStrategy): self.regularization_handler(loss, scheduled_lr) logger.info(self.__str__()) - - return scheduled_lr, max_train_steps + return scheduled_lr def exclude_from_weight_decay(self, name): if name.find("layer_norm") > -1: diff --git a/paddlehub/finetune/task/base_task.py b/paddlehub/finetune/task/base_task.py index 910364981236cbfb558450317f089dec08e2bba0..287a25d70162a365a872849ebd3caffd520b322d 100644 --- a/paddlehub/finetune/task/base_task.py +++ b/paddlehub/finetune/task/base_task.py @@ -35,6 +35,7 @@ import paddle.fluid as fluid from visualdl import LogWriter import paddlehub as hub +from paddlehub.reader.nlp_reader import BaseNLPReader from paddlehub.common.paddle_helper import dtype_map, clone_program from paddlehub.common.utils import mkdir from paddlehub.common.dir import tmp_dir @@ -84,7 +85,7 @@ class RunEnv(object): self.start_program = None self.main_program_compiled = None self.py_reader = None - self.reader = None + self.generator = None self.loss = None self.labels = None self.metrics = None @@ -260,8 +261,8 @@ class BaseTask(object): BaseTask is the base class of all the task. 
It will complete the building of all the running environment. Args: - feed_list (list): the inputs name - data_reader (object): data reader for the task + feed_list (list): the inputs name. Deprecated in paddlehub v1.8. + data_reader (object): data reader for the task. Deprecated in paddlehub v1.8. main_program (object): the customized main_program, default None startup_program (object): the customized startup_program, default None config (object): the config for the task, default None @@ -269,16 +270,13 @@ class BaseTask(object): """ def __init__(self, - feed_list, - data_reader, + dataset=None, + feed_list=None, + data_reader=None, main_program=None, startup_program=None, config=None, metrics_choices="default"): - # base item - self._base_data_reader = data_reader - self._base_feed_list = feed_list - # metrics item self.best_score = -999 if metrics_choices == "default": @@ -293,7 +291,6 @@ class BaseTask(object): if main_program is None: self._base_main_program = clone_program( fluid.default_main_program(), for_test=False) - else: self._base_main_program = clone_program( main_program, for_test=False) @@ -344,6 +341,23 @@ class BaseTask(object): # set default phase self.enter_phase("train") + self.dataset = dataset + if dataset: + self._label_list = dataset.get_labels() + # Compatible code for usage deprecated in paddlehub v1.8. + self._base_data_reader = data_reader + self._base_feed_list = feed_list + + if isinstance(data_reader, BaseNLPReader): + self._compatible_mode = True + logger.warning( + "PaddleHub v1.8 has deprecated the reader and feed_list parameters in the nlp Task. We provided an easier usage, " + "in which you can use your tokenizer to preprocess dataset and run task in a clear flow. " + "New demo see https://github.com/PaddlePaddle/PaddleHub/blob/release/v1.8/demo/text_classification/text_cls.py" + ) + else: + self._compatible_mode = False + @contextlib.contextmanager def phase_guard(self, phase): self.enter_phase(phase) @@ -420,9 +434,29 @@ class BaseTask(object): with fluid.program_guard(self.env.main_program, self._base_startup_program): with fluid.unique_name.guard(self.env.UNG): - self.scheduled_lr, self.max_train_steps = self.config.strategy.execute( - self.loss, self._base_data_reader, self.config, - self.device_count) + if self._compatible_mode: + # This branch is compatible code for usage deprecated in paddlehub v1.8. 
+ self._base_data_reader.data_generator( + batch_size=self.config.batch_size, + phase='train', + shuffle=True) + num_train_examples = self._base_data_reader.num_examples[ + 'train'] + try: + # nlp_reader + _in_tokens = self._base_data_reader.in_tokens + if _in_tokens: + num_train_examples *= self._base_data_reader.max_seq_len + except: + # cv_reader without .in_tokens and .max_seq_len + pass + else: + num_train_examples = len( + self.dataset.get_train_records()) + + self.max_train_steps = self.config.num_epoch * num_train_examples // self.config.batch_size // self.device_count + self.scheduled_lr = self.config.strategy.execute( + self.loss, self.max_train_steps) if self.is_train_phase: loss_name = self.env.loss.name @@ -529,17 +563,40 @@ class BaseTask(object): return self.main_program @property - def reader(self): - if self.is_predict_phase: - data = self._predict_data + def generator(self): + if self._compatible_mode: + if self.is_predict_phase: + data = self._predict_data + else: + data = None + self.env.generator = self._base_data_reader.data_generator( + batch_size=self.config.batch_size, + phase=self.phase, + data=data, + return_list=not self.config.use_pyreader) else: - data = None - self.env.reader = self._base_data_reader.data_generator( - batch_size=self.config.batch_size, - phase=self.phase, - data=data, - return_list=not self.config.use_pyreader) - return self.env.reader + + def data_generator(records): + def wrapper(): + for record in records: + values = [] + for feed_name in self.feed_list: + values.append(record[feed_name]) + yield values + + return wrapper + + if self.is_predict_phase: + records = self._predict_data + else: + if self.is_train_phase: + shuffle = True + else: + shuffle = False + records = self.dataset.get_records( + phase=self.phase, shuffle=shuffle) + self.env.generator = data_generator(records) + return self.env.generator @property def loss(self): @@ -580,13 +637,30 @@ class BaseTask(object): @property def feed_list(self): - feed_list = [varname for varname in self._base_feed_list] - if self.is_train_phase or self.is_test_phase: - feed_list += [label.name for label in self.labels] + if self._compatible_mode: + feed_list = [varname for varname in self._base_feed_list] + if self.is_train_phase or self.is_test_phase: + feed_list += [label.name for label in self.labels] + else: + if not self.env.is_inititalized: + self._build_env() + + if self._predict_data: + feed_list = list(self._predict_data[0].keys()) + else: + feed_list = self.dataset.get_feed_list(self.phase) + + feed_list = [ + feed_name for feed_name in feed_list + if feed_name in self.main_program.global_block().vars + ] return feed_list @property def feed_var_list(self): + if not self.env.is_inititalized: + self._build_env() + vars = self.main_program.global_block().vars return [vars[varname] for varname in self.feed_list] @@ -890,13 +964,20 @@ class BaseTask(object): self.env.current_epoch += 1 # Final evaluation - if self._base_data_reader.get_dev_examples() != []: + if self._compatible_mode: + dev_examples = self._base_data_reader.get_dev_examples() + test_examples = self._base_data_reader.get_test_examples() + else: + dev_examples = self.dataset.get_dev_examples() + test_examples = self.dataset.get_test_examples() + if dev_examples != []: # Warning: DO NOT use self.eval(phase="dev", load_best_model=True) during training. # It will cause trainer unable to continue training from checkpoint after eval. # More important, The model should evaluate current performance during training. 
self.eval(phase="dev") - if self._base_data_reader.get_test_examples() != []: + if test_examples != []: self.eval(phase="test", load_best_model=True) + # Save checkpoint after finetune self.save_checkpoint() @@ -957,17 +1038,41 @@ class BaseTask(object): global_run_states = [] period_run_states = [] - for run_step, batch in enumerate(self.reader(), start=1): + feed_var_shape = [] + feed_var_type = [] + for var in self.feed_var_list: + feed_var_shape.append(var.shape) + feed_var_type.append(dtype_map[var.dtype]) + + if self._compatible_mode: + data_reader = self.generator + else: + data_reader = paddle.batch( + self.generator, batch_size=self.config.batch_size) + for batch in data_reader(): + if self._compatible_mode and not self.config.use_pyreader: + # if not use pyreader, the nlp_reader return [batch] + batch = batch[0] + step_run_state = RunState(len(self.fetch_list)) step_run_state.run_step = 1 num_batch_examples = len(batch) - if not self.config.use_pyreader: - # if use pyreader, the nlp_reader return [batch] - batch = batch[0] - - batch = [fluid.core.PaddleTensor(data) for data in batch] - fetch_result = self._predictor.run(batch) + # Preocessing data to the suitable shape and type for the model + processed_batch = [[] for i in range(len(self.feed_list))] + if self._compatible_mode: + processed_batch = batch + else: + for sample in batch: + for i, data in enumerate(sample): + processed_batch[i].append(data) + tensor_batch = [[] for i in range(len(self.feed_list))] + for i in range(len(processed_batch)): + processed_batch[i] = np.array(processed_batch[i]).reshape( + feed_var_shape[i]).astype(feed_var_type[i]) + tensor_batch[i] = fluid.core.PaddleTensor(processed_batch[i]) + + fetch_result = self._predictor.run(tensor_batch) for index, result in enumerate(fetch_result): step_run_state.run_results[index] = result.as_ndarray() step_run_state.run_examples += num_batch_examples @@ -978,18 +1083,23 @@ class BaseTask(object): global_run_states += period_run_states return global_run_states - def predict(self, - data, - load_best_model=True, - return_result=False, - accelerate_mode=True): + def predict( + self, + data=None, + label_list=None, + load_best_model=True, + return_result=False, + accelerate_mode=True, + ): """ make prediction for the input data. Args: - data (list): the data will be predicted. + data (list): the data will be predicted. Its element should be a record when the task is initialized without data_reader param, + or a plaintext string list when the task is initialized with data_reader param (deprecated in paddlehub v1.8). + label_list (list): the label list, used to proprocess the output. load_best_model (bool): load the best model or not - return_result (bool): return a readable result or just the raw run result + return_result (bool): return a readable result or just the raw run result. Always True when the task is not initialized with data_reader param. 
accelerate_mode (bool): use high-performance predictor or not Returns: @@ -1005,6 +1115,7 @@ class BaseTask(object): with self.phase_guard(phase="predict"): self._predict_data = data + self._label_list = label_list self._predict_start_event() if load_best_model: @@ -1020,7 +1131,7 @@ class BaseTask(object): self._predict_end_event(run_states) self._predict_data = None - if return_result: + if return_result or not self._compatible_mode: return self._postprocessing(run_states) return run_states @@ -1057,20 +1168,34 @@ class BaseTask(object): capacity=64, use_double_buffer=True, iterable=True) - data_reader = data_loader.set_batch_generator( - self.reader, places=self.places) + if self._compatible_mode: + data_reader = data_loader.set_batch_generator( + self.generator, places=self.places) + else: + data_reader = data_loader.set_sample_generator( + self.generator, + places=self.places, + batch_size=self.config.batch_size, + drop_last=True) else: data_feeder = fluid.DataFeeder( feed_list=self.feed_list, place=self.place) - data_reader = data_feeder.decorate_reader( - self.reader, - multi_devices=self.config.use_data_parallel, - drop_last=True) + if self._compatible_mode: + data_reader = data_feeder.decorate_reader( + self.generator, + multi_devices=self.config.use_data_parallel, + drop_last=True) + else: + data_reader = data_feeder.decorate_reader( + paddle.batch( + self.generator, batch_size=self.config.batch_size), + multi_devices=self.config.use_data_parallel, + drop_last=True) global_run_states = [] period_run_states = [] - for run_step, batch in enumerate(data_reader(), start=1): + for batch in data_reader(): step_run_state = RunState(len(self.fetch_list)) step_run_state.run_step = 1 num_batch_examples = len(batch) @@ -1107,6 +1232,5 @@ class BaseTask(object): return global_run_states def __repr__(self): - return "Task: %s with metrics_choices: %s, reader: %s, %s" % ( - self.__class__.__name__, self.metrics_choices, - self._base_data_reader.__class__.__name__, self.config) + return "Task: %s with metrics_choices: %s, %s" % ( + self.__class__.__name__, self.metrics_choices, self.config) diff --git a/paddlehub/finetune/task/classifier_task.py b/paddlehub/finetune/task/classifier_task.py index b137afdc750737a6927a981014817d6b6383c36a..5bc2c40993e0b973388ccb5692992ba3dfade249 100644 --- a/paddlehub/finetune/task/classifier_task.py +++ b/paddlehub/finetune/task/classifier_task.py @@ -19,13 +19,12 @@ from __future__ import print_function from collections import OrderedDict import numpy as np -import paddle import paddle.fluid as fluid import time from paddlehub.common.logger import logger from paddlehub.finetune.evaluate import calculate_f1_np, matthews_corrcoef -from paddlehub.reader.nlp_reader import ClassifyReader +from paddlehub.reader.nlp_reader import ClassifyReader, LACClassifyReader import paddlehub.network as net from .base_task import BaseTask @@ -35,8 +34,9 @@ class ClassifierTask(BaseTask): def __init__(self, feature, num_classes, - feed_list, - data_reader, + dataset=None, + feed_list=None, + data_reader=None, startup_program=None, config=None, hidden_units=None, @@ -46,6 +46,7 @@ class ClassifierTask(BaseTask): main_program = feature.block.program super(ClassifierTask, self).__init__( + dataset=dataset, data_reader=data_reader, main_program=main_program, feed_list=feed_list, @@ -109,7 +110,7 @@ class ClassifierTask(BaseTask): run_examples += run_state.run_examples run_step += run_state.run_step loss_sum += np.mean( - run_state.run_results[-2]) * run_state.run_examples + 
run_state.run_results[-1]) * run_state.run_examples acc_sum += np.mean( run_state.run_results[2]) * run_state.run_examples np_labels = run_state.run_results[0] @@ -140,20 +141,28 @@ class ClassifierTask(BaseTask): return scores, avg_loss, run_speed def _postprocessing(self, run_states): - try: - id2label = { - val: key - for key, val in self._base_data_reader.label_map.items() - } - except: - raise Exception( - "ImageClassificationDataset does not support postprocessing, please use BaseCVDataset instead" - ) + if self._compatible_mode: + try: + label_list = list(self._base_data_reader.label_map.keys()) + except: + raise Exception( + "ImageClassificationDataset does not support postprocessing, please use BaseCVDataset instead" + ) + else: + if self._label_list: + label_list = self._label_list + else: + logger.warning( + "Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter." + ) + return run_states results = [] for batch_state in run_states: batch_result = batch_state.run_results batch_infer = np.argmax(batch_result[0], axis=1) - results += [id2label[sample_infer] for sample_infer in batch_infer] + results += [ + label_list[sample_infer] for sample_infer in batch_infer + ] return results @@ -166,22 +175,24 @@ class TextClassifierTask(ClassifierTask): It will use full-connect layer with softmax activation function to classify texts. """ - def __init__(self, - num_classes, - feed_list, - data_reader, - feature=None, - token_feature=None, - network=None, - startup_program=None, - config=None, - hidden_units=None, - metrics_choices="default"): + def __init__( + self, + num_classes, + dataset=None, + feed_list=None, # Deprecated + data_reader=None, # Deprecated + feature=None, + token_feature=None, + network=None, + startup_program=None, + config=None, + hidden_units=None, + metrics_choices="default"): """ Args: num_classes: total labels of the text classification task. - feed_list(list): the variable name that will be feeded to the main program - data_reader(object): data reader for the task. It must be one of ClassifyReader and LACClassifyReader. + feed_list(list): the variable name that will be feeded to the main program, Deprecated in paddlehub v1.8. + data_reader(object): data reader for the task. It must be one of ClassifyReader and LACClassifyReader, Deprecated in paddlehub v1.8.. feature(Variable): the `feature` will be used to classify texts. It must be the sentence-level feature, shape as [-1, emb_size]. `Token_feature` and `feature` couldn't be setted at the same time. One of them must be setted as not None. Default None. token_feature(Variable): the `feature` will be used to connect the pre-defined network. It must be the token-level feature, shape as [-1, seq_len, emb_size]. Default None. network(str): the pre-defined network. Choices: 'bilstm', 'bow', 'cnn', 'dpcnn', 'gru' and 'lstm'. Default None. If network is setted, then `token_feature` must be setted and `feature` must be None. @@ -193,12 +204,12 @@ class TextClassifierTask(ClassifierTask): """ if (not feature) and (not token_feature): logger.error( - 'Both token_feature and feature are None, one of them must be setted.' + 'Both token_feature and feature are None, one of them must be set.' ) exit(1) elif feature and token_feature: logger.error( - 'Both token_feature and feature are setted. One should be setted, the other should be None.' + 'Both token_feature and feature are set. One should be set, the other should be None.' 
) exit(1) @@ -226,6 +237,7 @@ class TextClassifierTask(ClassifierTask): metrics_choices = ["acc"] super(TextClassifierTask, self).__init__( + dataset=dataset, data_reader=data_reader, feature=feature if feature else token_feature, num_classes=num_classes, @@ -236,16 +248,14 @@ class TextClassifierTask(ClassifierTask): metrics_choices=metrics_choices) def _build_net(self): - if isinstance(self._base_data_reader, ClassifyReader): - # ClassifyReader will return the seqence length of an input text + if not isinstance(self._base_data_reader, LACClassifyReader): + # LACClassifyReader wont return the seqence length, while Dataset with tokenizer and ClassifyReader will. self.seq_len = fluid.layers.data( name="seq_len", shape=[1], dtype='int64', lod_level=0) self.seq_len_used = fluid.layers.squeeze(self.seq_len, axes=[1]) - # unpad the token_feature unpad_feature = fluid.layers.sequence_unpad( self.feature, length=self.seq_len_used) - if self.network: # add pre-defined net net_func = getattr(net.classification, self.network) @@ -254,9 +264,14 @@ class TextClassifierTask(ClassifierTask): cls_feats = net_func( self.feature, emb_dim=self.feature.shape[-1]) else: - cls_feats = net_func(unpad_feature) - logger.info( - "%s has been added in the TextClassifierTask!" % self.network) + if self._compatible_mode and isinstance(self._base_data_reader, + LACClassifyReader): + cls_feats = net_func(self.feature) + else: + cls_feats = net_func(unpad_feature) + if self.is_train_phase: + logger.info("%s has been added in the TextClassifierTask!" % + self.network) else: # not use pre-defined net but to use fc net cls_feats = fluid.layers.dropout( @@ -286,12 +301,15 @@ class TextClassifierTask(ClassifierTask): @property def feed_list(self): - feed_list = [varname for varname in self._base_feed_list] - if isinstance(self._base_data_reader, ClassifyReader): - # ClassifyReader will return the seqence length of an input text - feed_list += [self.seq_len.name] - if self.is_train_phase or self.is_test_phase: - feed_list += [self.labels[0].name] + if self._compatible_mode: + feed_list = [varname for varname in self._base_feed_list] + if isinstance(self._base_data_reader, ClassifyReader): + # ClassifyReader will return the seqence length of an input text + feed_list += [self.seq_len.name] + if self.is_train_phase or self.is_test_phase: + feed_list += [self.labels[0].name] + else: + feed_list = super(TextClassifierTask, self).feed_list return feed_list @property @@ -303,11 +321,10 @@ class TextClassifierTask(ClassifierTask): ] else: # predict phase - fetch_list = [self.outputs[0].name] - - if isinstance(self._base_data_reader, ClassifyReader): - # to avoid save_inference_model to prune seq_len variable - fetch_list += [self.seq_len.name] + if isinstance(self._base_data_reader, LACClassifyReader): + fetch_list = [self.outputs[0].name] + else: + fetch_list = [self.outputs[0].name, self.seq_len.name] return fetch_list @@ -316,8 +333,9 @@ class MultiLabelClassifierTask(ClassifierTask): def __init__(self, feature, num_classes, - feed_list, - data_reader, + dataset=None, + feed_list=None, + data_reader=None, startup_program=None, config=None, hidden_units=None, @@ -325,8 +343,8 @@ class MultiLabelClassifierTask(ClassifierTask): if metrics_choices == "default": metrics_choices = ["auc"] - main_program = feature.block.program super(MultiLabelClassifierTask, self).__init__( + dataset=dataset, data_reader=data_reader, feature=feature, num_classes=num_classes, @@ -335,7 +353,10 @@ class MultiLabelClassifierTask(ClassifierTask): 
config=config, hidden_units=hidden_units, metrics_choices=metrics_choices) - self.class_name = list(data_reader.label_map.keys()) + if self._compatible_mode: + self.class_name = list(data_reader.label_map.keys()) + else: + self.class_name = self._label_list def _build_net(self): cls_feats = fluid.layers.dropout( @@ -428,13 +449,22 @@ class MultiLabelClassifierTask(ClassifierTask): def _postprocessing(self, run_states): results = [] - label_list = list(self._base_data_reader.label_map.keys()) + if self._compatible_mode: + label_list = list(self._base_data_reader.label_map.keys()) + else: + if self._label_list: + label_list = self._label_list + else: + logger.warning( + "Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter." + ) + return run_states + for batch_state in run_states: batch_result = batch_state.run_results for sample_id in range(len(batch_result[0])): sample_result = [] - for category_id in range( - self._base_data_reader.dataset.num_labels): + for category_id in range(len(label_list)): sample_category_prob = batch_result[category_id][sample_id] sample_category_value = np.argmax(sample_category_prob) sample_result.append( diff --git a/paddlehub/finetune/task/reading_comprehension_task.py b/paddlehub/finetune/task/reading_comprehension_task.py index cb01f0eb7075915d78f8835b08c70ff82cef5959..bc68a718ef6f254cf0162c8d113ee9e0230fceac 100644 --- a/paddlehub/finetune/task/reading_comprehension_task.py +++ b/paddlehub/finetune/task/reading_comprehension_task.py @@ -18,23 +18,22 @@ from __future__ import division from __future__ import print_function import time -import os import collections import math import six import json - -from collections import OrderedDict - import io + +from tqdm import tqdm import numpy as np import paddle.fluid as fluid -from .base_task import BaseTask + from paddlehub.common.logger import logger from paddlehub.reader import tokenization from paddlehub.finetune.evaluator import squad1_evaluate from paddlehub.finetune.evaluator import squad2_evaluate from paddlehub.finetune.evaluator import cmrc2018_evaluate +from .base_task import BaseTask def _get_best_indexes(logits, n_best_size): @@ -193,183 +192,189 @@ def get_predictions(all_examples, all_features, all_results, n_best_size, all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() - for (example_index, example) in enumerate(all_examples): - features = example_index_to_features[example_index] - - prelim_predictions = [] - # keep track of the minimum score of null start+end of position 0 - score_null = 1000000 # large and positive - min_null_feature_index = 0 # the paragraph slice with min mull score - null_start_logit = 0 # the start logit at the slice with min null score - null_end_logit = 0 # the end logit at the slice with min null score - for (feature_index, feature) in enumerate(features): - if feature.unique_id not in unique_id_to_result: - logger.info( - "As using multidevice, the last one batch is so small that the feature %s in the last batch is discarded " - % feature.unique_id) - continue - result = unique_id_to_result[feature.unique_id] - start_indexes = _get_best_indexes(result.start_logits, n_best_size) - end_indexes = _get_best_indexes(result.end_logits, n_best_size) + logger.info("Post processing...") + with tqdm(total=len(all_examples)) as process_bar: + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + 
prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min mull score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for (feature_index, feature) in enumerate(features): + if feature.unique_id not in unique_id_to_result: + logger.info( + "As using multidevice, the last one batch is so small that the feature %s in the last batch is discarded " + % feature.unique_id) + continue + result = unique_id_to_result[feature.unique_id] + start_indexes = _get_best_indexes(result.start_logits, + n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + + # if we could have irrelevant answers, get the min score of irrelevant + if version_2_with_negative: + feature_null_score = result.start_logits[ + 0] + result.end_logits[0] + if feature_null_score < score_null: + score_null = feature_null_score + min_null_feature_index = feature_index + null_start_logit = result.start_logits[0] + null_end_logit = result.end_logits[0] + + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. + if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get( + start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) - # if we could have irrelevant answers, get the min score of irrelevant if version_2_with_negative: - feature_null_score = result.start_logits[0] + result.end_logits[ - 0] - if feature_null_score < score_null: - score_null = feature_null_score - min_null_feature_index = feature_index - null_start_logit = result.start_logits[0] - null_end_logit = result.end_logits[0] - - for start_index in start_indexes: - for end_index in end_indexes: - # We could hypothetically create invalid predictions, e.g., predict - # that the start of the span is in the question. We throw out all - # invalid predictions. 
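# Illustrative sketch (not part of the patch): the n-best selection used above.
# _get_best_indexes keeps the indexes of the n_best_size largest logits, and
# candidate answer spans are every (start, end) pair that survives the validity
# checks (end >= start, span length within max_answer_length). Toy values only.
import numpy as np

def best_indexes(logits, n_best_size):
    return [int(i) for i in np.argsort(logits)[::-1][:n_best_size]]

start_logits = [0.1, 2.3, 0.7, 1.9]
end_logits = [0.2, 0.4, 2.8, 1.1]
starts, ends = best_indexes(start_logits, 2), best_indexes(end_logits, 2)
spans = [(s, e) for s in starts for e in ends if e >= s and e - s + 1 <= 30]
print(starts, ends, spans)   # [1, 3] [2, 3] [(1, 2), (1, 3), (3, 3)]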
- if start_index >= len(feature.tokens): - continue - if end_index >= len(feature.tokens): - continue - if start_index not in feature.token_to_orig_map: - continue - if end_index not in feature.token_to_orig_map: - continue - if not feature.token_is_max_context.get(start_index, False): - continue - if end_index < start_index: - continue - length = end_index - start_index + 1 - if length > max_answer_length: + prelim_predictions.append( + _PrelimPrediction( + feature_index=min_null_feature_index, + start_index=0, + end_index=0, + start_logit=null_start_logit, + end_logit=null_end_logit)) + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + + seen_predictions = {} + nbest = [] + if not prelim_predictions: + logger.warning(("not prelim_predictions:", example.qas_id)) + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0: # this is a non-null prediction + tok_tokens = feature.tokens[pred.start_index:( + pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start:( + orig_doc_end + 1)] + if is_english: + tok_text = " ".join(tok_tokens) + else: + tok_text = "".join(tok_tokens) + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + if is_english: + orig_text = " ".join(orig_tokens) + else: + orig_text = "".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, + do_lower_case, is_english) + if final_text in seen_predictions: continue - prelim_predictions.append( - _PrelimPrediction( - feature_index=feature_index, - start_index=start_index, - end_index=end_index, - start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index])) - - if version_2_with_negative: - prelim_predictions.append( - _PrelimPrediction( - feature_index=min_null_feature_index, - start_index=0, - end_index=0, - start_logit=null_start_logit, - end_logit=null_end_logit)) - prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_logit + x.end_logit), - reverse=True) - - seen_predictions = {} - nbest = [] - if not prelim_predictions: - logger.warning(("not prelim_predictions:", example.qas_id)) - for pred in prelim_predictions: - if len(nbest) >= n_best_size: - break - feature = features[pred.feature_index] - if pred.start_index > 0: # this is a non-null prediction - tok_tokens = feature.tokens[pred.start_index:( - pred.end_index + 1)] - orig_doc_start = feature.token_to_orig_map[pred.start_index] - orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:( - orig_doc_end + 1)] - if is_english: - tok_text = " ".join(tok_tokens) - else: - tok_text = "".join(tok_tokens) - # De-tokenize WordPieces that have been split off. 
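# Illustrative sketch (not part of the patch): the WordPiece de-tokenization step
# shown above. Sub-word continuations carry a "##" marker, so joining on spaces
# and stripping the markers recovers the surface text before get_final_text()
# aligns it with the original document tokens.
tok_tokens = ["the", "un", "##aff", "##able", "ruler"]
tok_text = " ".join(tok_tokens)
tok_text = tok_text.replace(" ##", "").replace("##", "")
tok_text = " ".join(tok_text.strip().split())   # clean up whitespace
print(tok_text)   # "the unaffable ruler"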
- tok_text = tok_text.replace(" ##", "") - tok_text = tok_text.replace("##", "") - - # Clean whitespace - tok_text = tok_text.strip() - tok_text = " ".join(tok_text.split()) - if is_english: - orig_text = " ".join(orig_tokens) + + seen_predictions[final_text] = True else: - orig_text = "".join(orig_tokens) + final_text = "" + seen_predictions[final_text] = True - final_text = get_final_text(tok_text, orig_text, do_lower_case, - is_english) - if final_text in seen_predictions: - continue + nbest.append( + _NbestPrediction( + text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) - seen_predictions[final_text] = True - else: - final_text = "" - seen_predictions[final_text] = True - - nbest.append( - _NbestPrediction( - text=final_text, - start_logit=pred.start_logit, - end_logit=pred.end_logit)) - - # if we didn't include the empty option in the n-best, include it - if version_2_with_negative: - if "" not in seen_predictions: + # if we didn't include the empty option in the n-best, include it + if version_2_with_negative: + if "" not in seen_predictions: + nbest.append( + _NbestPrediction( + text="", + start_logit=null_start_logit, + end_logit=null_end_logit)) + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. + if not nbest: nbest.append( _NbestPrediction( - text="", - start_logit=null_start_logit, - end_logit=null_end_logit)) - # In very rare edge cases we could have no valid predictions. So we - # just create a nonce prediction in this case to avoid failure. - if not nbest: - nbest.append( - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) - - assert len(nbest) >= 1 - - total_scores = [] - best_non_null_entry = None - for entry in nbest: - total_scores.append(entry.start_logit + entry.end_logit) - if not best_non_null_entry: - if entry.text: - best_non_null_entry = entry - - probs = _compute_softmax(total_scores) - - nbest_json = [] - for (i, entry) in enumerate(nbest): - output = collections.OrderedDict() - output["text"] = entry.text - output["probability"] = probs[i] - output["start_logit"] = entry.start_logit - output["end_logit"] = entry.end_logit - nbest_json.append(output) - - assert len(nbest_json) >= 1 - - if not version_2_with_negative: - all_predictions[example.qas_id] = nbest_json[0]["text"] - else: - # predict "" iff the null score - the score of best non-null > threshold - score_diff = score_null - if best_non_null_entry: - score_diff -= best_non_null_entry.start_logit + best_non_null_entry.end_logit - scores_diff_json[example.qas_id] = score_diff - if score_diff > null_score_diff_threshold: - all_predictions[example.qas_id] = "" - else: - all_predictions[example.qas_id] = best_non_null_entry.text + text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1 - all_nbest_json[example.qas_id] = nbest_json + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_json.append(output) + + assert len(nbest_json) >= 1 + + if not version_2_with_negative: + all_predictions[example.qas_id] = nbest_json[0]["text"] + 
else: + # predict "" iff the null score - the score of best non-null > threshold + score_diff = score_null + if best_non_null_entry: + score_diff -= best_non_null_entry.start_logit + best_non_null_entry.end_logit + scores_diff_json[example.qas_id] = score_diff + if score_diff > null_score_diff_threshold: + all_predictions[example.qas_id] = "" + else: + all_predictions[example.qas_id] = best_non_null_entry.text + all_nbest_json[example.qas_id] = nbest_json + process_bar.update(1) return all_predictions, all_nbest_json, scores_diff_json class ReadingComprehensionTask(BaseTask): def __init__(self, feature, - feed_list, - data_reader, + dataset=None, + feed_list=None, + data_reader=None, startup_program=None, config=None, metrics_choices=None, @@ -379,7 +384,9 @@ class ReadingComprehensionTask(BaseTask): max_answer_length=30): main_program = feature.block.program + self.data_reader = data_reader super(ReadingComprehensionTask, self).__init__( + dataset=dataset, data_reader=data_reader, main_program=main_program, feed_list=feed_list, @@ -387,7 +394,6 @@ class ReadingComprehensionTask(BaseTask): config=config, metrics_choices=metrics_choices) self.feature = feature - self.data_reader = data_reader self.sub_task = sub_task.lower() self.version_2_with_negative = (self.sub_task == "squad2.0") if self.sub_task in ["squad2.0", "squad"]: @@ -407,10 +413,10 @@ class ReadingComprehensionTask(BaseTask): "RawResult", ["unique_id", "start_logits", "end_logits"]) def _build_net(self): - self.unique_ids = fluid.layers.data( - name="unique_ids", shape=[-1, 1], lod_level=0, dtype="int64") + self.unique_id = fluid.layers.data( + name="unique_id", shape=[-1, 1], lod_level=0, dtype="int64") # to avoid memory optimization - _ = fluid.layers.assign(self.unique_ids) + _ = fluid.layers.assign(self.unique_id) logits = fluid.layers.fc( input=self.feature, size=2, @@ -432,24 +438,24 @@ class ReadingComprehensionTask(BaseTask): return [start_logits, end_logits, num_seqs] def _add_label(self): - start_positions = fluid.layers.data( - name="start_positions", shape=[-1, 1], lod_level=0, dtype="int64") - end_positions = fluid.layers.data( - name="end_positions", shape=[-1, 1], lod_level=0, dtype="int64") - return [start_positions, end_positions] + start_position = fluid.layers.data( + name="start_position", shape=[-1, 1], lod_level=0, dtype="int64") + end_position = fluid.layers.data( + name="end_position", shape=[-1, 1], lod_level=0, dtype="int64") + return [start_position, end_position] def _add_loss(self): - start_positions = self.labels[0] - end_positions = self.labels[1] + start_position = self.labels[0] + end_position = self.labels[1] start_logits = self.outputs[0] end_logits = self.outputs[1] start_loss = fluid.layers.softmax_with_cross_entropy( - logits=start_logits, label=start_positions) + logits=start_logits, label=start_position) start_loss = fluid.layers.mean(x=start_loss) end_loss = fluid.layers.softmax_with_cross_entropy( - logits=end_logits, label=end_positions) + logits=end_logits, label=end_position) end_loss = fluid.layers.mean(x=end_loss) total_loss = (start_loss + end_loss) / 2.0 return total_loss @@ -459,22 +465,25 @@ class ReadingComprehensionTask(BaseTask): @property def feed_list(self): - feed_list = [varname for varname in self._base_feed_list - ] + [self.unique_ids.name] - if self.is_train_phase or self.is_test_phase: - feed_list += [label.name for label in self.labels] + if self._compatible_mode: + feed_list = [varname for varname in self._base_feed_list + ] + [self.unique_id.name] + if 
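# Illustrative sketch (not part of the patch): the span-extraction loss defined in
# _add_loss above. Start and end positions are trained with two independent
# softmax cross-entropy losses whose means are averaged. Plain numpy stand-in.
import numpy as np

def softmax_cross_entropy(logits, label):
    probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)
    return -np.log(probs[np.arange(len(label)), label])

start_logits = np.array([[2.0, 0.1, 0.3], [0.2, 1.5, 0.1]])  # [batch, seq_len]
end_logits = np.array([[0.3, 1.8, 0.2], [0.1, 0.4, 2.2]])
start_position = np.array([0, 1])
end_position = np.array([1, 2])

start_loss = softmax_cross_entropy(start_logits, start_position).mean()
end_loss = softmax_cross_entropy(end_logits, end_position).mean()
total_loss = (start_loss + end_loss) / 2.0
print(round(float(total_loss), 4))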
self.is_train_phase or self.is_test_phase: + feed_list += [label.name for label in self.labels] + else: + feed_list = super(ReadingComprehensionTask, self).feed_list return feed_list @property def fetch_list(self): if self.is_train_phase or self.is_test_phase: return [ - self.loss.name, self.outputs[-1].name, self.unique_ids.name, + self.loss.name, self.outputs[-1].name, self.unique_id.name, self.outputs[0].name, self.outputs[1].name ] elif self.is_predict_phase: return [ - self.unique_ids.name, + self.unique_id.name, ] + [output.name for output in self.outputs] def _calculate_metrics(self, run_states): @@ -503,11 +512,17 @@ class ReadingComprehensionTask(BaseTask): run_time_used = time.time() - run_states[0].run_time_begin run_speed = run_step / run_time_used avg_loss = np.sum(total_cost) / np.sum(total_num_seqs) - scores = OrderedDict() + scores = collections.OrderedDict() # If none of metrics has been implemented, loss will be used to evaluate. if self.is_test_phase: - all_examples = self.data_reader.all_examples[self.phase] - all_features = self.data_reader.all_features[self.phase] + if self._compatible_mode: + all_examples = self.data_reader.all_examples[self.phase] + all_features = self.data_reader.all_features[self.phase] + dataset = self.data_reader.dataset + else: + all_examples = self.dataset.get_examples(self.phase) + all_features = self.dataset.get_features(self.phase) + dataset = self.dataset all_predictions, all_nbest_json, scores_diff_json = get_predictions( all_examples=all_examples, all_features=all_features, @@ -519,28 +534,23 @@ class ReadingComprehensionTask(BaseTask): null_score_diff_threshold=self.null_score_diff_threshold, is_english=self.is_english) if self.phase == 'val' or self.phase == 'dev': - with io.open( - self.data_reader.dataset.dev_path, 'r', - encoding="utf8") as dataset_file: - dataset_json = json.load(dataset_file) - dataset = dataset_json['data'] + dataset_path = dataset.dev_path elif self.phase == 'test': - with io.open( - self.data_reader.dataset.test_path, 'r', - encoding="utf8") as dataset_file: - dataset_json = json.load(dataset_file) - dataset = dataset_json['data'] + dataset_path = dataset.test_path else: raise Exception("Error phase: %s when runing _calculate_metrics" % self.phase) + with io.open(dataset_path, 'r', encoding="utf8") as dataset_file: + dataset_json = json.load(dataset_file) + data = dataset_json['data'] if self.sub_task == "squad": - scores = squad1_evaluate.evaluate(dataset, all_predictions) + scores = squad1_evaluate.evaluate(data, all_predictions) elif self.sub_task == "squad2.0": - scores = squad2_evaluate.evaluate(dataset, all_predictions, + scores = squad2_evaluate.evaluate(data, all_predictions, scores_diff_json) elif self.sub_task in ["cmrc2018", "drcd"]: - scores = cmrc2018_evaluate.get_eval(dataset, all_predictions) + scores = cmrc2018_evaluate.get_eval(data, all_predictions) return scores, avg_loss, run_speed def _postprocessing(self, run_states): @@ -558,8 +568,12 @@ class ReadingComprehensionTask(BaseTask): unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) - all_examples = self.data_reader.all_examples[self.phase] - all_features = self.data_reader.all_features[self.phase] + if self._compatible_mode: + all_examples = self.data_reader.all_examples[self.phase] + all_features = self.data_reader.all_features[self.phase] + else: + all_examples = self.dataset.get_examples(self.phase) + all_features = self.dataset.get_features(self.phase) all_predictions, all_nbest_json, scores_diff_json = 
get_predictions( all_examples=all_examples, all_features=all_features, diff --git a/paddlehub/finetune/task/regression_task.py b/paddlehub/finetune/task/regression_task.py index 3ddfbaf2cc27c19ca525c6220464396b1cd194b0..46892689917b2eddedcc3e0be22a257470b28d3b 100644 --- a/paddlehub/finetune/task/regression_task.py +++ b/paddlehub/finetune/task/regression_task.py @@ -29,8 +29,9 @@ from .base_task import BaseTask class RegressionTask(BaseTask): def __init__(self, feature, - feed_list, - data_reader, + dataset=None, + feed_list=None, + data_reader=None, startup_program=None, config=None, hidden_units=None, @@ -40,6 +41,7 @@ class RegressionTask(BaseTask): main_program = feature.block.program super(RegressionTask, self).__init__( + dataset=dataset, data_reader=data_reader, main_program=main_program, feed_list=feed_list, diff --git a/paddlehub/finetune/task/sequence_task.py b/paddlehub/finetune/task/sequence_task.py index ac46c990a2cf990f5c9cf7e9bbde3cfc9ae1f270..e71d51265938dc1a809cd8a0dcf6a541a0e14d92 100644 --- a/paddlehub/finetune/task/sequence_task.py +++ b/paddlehub/finetune/task/sequence_task.py @@ -21,10 +21,9 @@ import time from collections import OrderedDict import numpy as np -import paddle import paddle.fluid as fluid from paddlehub.finetune.evaluate import chunk_eval, calculate_f1 -from paddlehub.common.utils import version_compare +from paddlehub.common.logger import logger from .base_task import BaseTask @@ -33,8 +32,9 @@ class SequenceLabelTask(BaseTask): feature, max_seq_len, num_classes, - feed_list, - data_reader, + dataset=None, + feed_list=None, + data_reader=None, startup_program=None, config=None, metrics_choices="default", @@ -46,6 +46,7 @@ class SequenceLabelTask(BaseTask): main_program = feature.block.program super(SequenceLabelTask, self).__init__( + dataset=dataset, data_reader=data_reader, main_program=main_program, feed_list=feed_list, @@ -199,11 +200,14 @@ class SequenceLabelTask(BaseTask): @property def feed_list(self): - feed_list = [varname for varname in self._base_feed_list] - if self.is_train_phase or self.is_test_phase: - feed_list += [self.labels[0].name, self.seq_len.name] + if self._compatible_mode: + feed_list = [varname for varname in self._base_feed_list] + if self.is_train_phase or self.is_test_phase: + feed_list += [self.labels[0].name, self.seq_len.name] + else: + feed_list += [self.seq_len.name] else: - feed_list += [self.seq_len.name] + feed_list = super(SequenceLabelTask, self).feed_list return feed_list @property @@ -215,10 +219,22 @@ class SequenceLabelTask(BaseTask): return [output.name for output in self.outputs] def _postprocessing(self, run_states): - id2label = { - val: key - for key, val in self._base_data_reader.label_map.items() - } + if self._compatible_mode: + id2label = { + val: key + for key, val in self._base_data_reader.label_map.items() + } + else: + if self._label_list: + id2label = {} + for index, label in enumerate(self._label_list): + id2label[index] = label + else: + logger.warning( + "Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter." 
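# Illustrative sketch (not part of the patch): the id2label mapping built in
# SequenceLabelTask._postprocessing above when the task is constructed from a
# dataset (non-compatible mode). Predicted ids are mapped back to tag strings.
label_list = ["B-PER", "I-PER", "O"]                 # e.g. what dataset.get_labels() might return
id2label = {index: label for index, label in enumerate(label_list)}

predicted_ids = [0, 1, 2, 2]
print([id2label[i] for i in predicted_ids])          # ['B-PER', 'I-PER', 'O', 'O']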
+ ) + return run_states + results = [] for batch_states in run_states: batch_results = batch_states.run_results diff --git a/paddlehub/reader/nlp_reader.py b/paddlehub/reader/nlp_reader.py index 7cf4cf67ce70de850ca0c04dc4cc37d58ee8c6f2..237d1e62741085341e5af01429d5b112ea4a4e17 100644 --- a/paddlehub/reader/nlp_reader.py +++ b/paddlehub/reader/nlp_reader.py @@ -688,11 +688,13 @@ class Features(object): s = "" s += "unique_id: %s " % self.unique_id s += "example_index: %s " % self.example_index + s += "doc_span_index: %s" % self.doc_span_index + s += "tokens: %s" % self.tokens + s += "token_to_orig_map %s" % self.token_to_orig_map + s += "token_is_max_context %s" % self.token_is_max_context s += "start_position: %s " % self.start_position s += "end_position: %s " % self.end_position s += "is_impossible: %s " % self.is_impossible - # s += "tokens: %s" % self.tokens - # s += "token_to_orig_map %s" % self.token_to_orig_map return s diff --git a/paddlehub/reader/tokenization.py b/paddlehub/reader/tokenization.py index bde0ed43cd5140d2d926b5e43d53e0f55ed91205..404ae4993e07f455fa1790db08fa880545b0228a 100644 --- a/paddlehub/reader/tokenization.py +++ b/paddlehub/reader/tokenization.py @@ -140,29 +140,6 @@ class FullTokenizer(object): return convert_by_vocab(self.inv_vocab, ids) -class CharTokenizer(object): - """Runs end-to-end tokenziation.""" - - def __init__(self, vocab_file, do_lower_case=True): - self.vocab = load_vocab(vocab_file) - self.inv_vocab = {v: k for k, v in self.vocab.items()} - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - - def tokenize(self, text): - split_tokens = [] - for token in text.lower().split(" "): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - - return split_tokens - - def convert_tokens_to_ids(self, tokens): - return convert_by_vocab(self.vocab, tokens) - - def convert_ids_to_tokens(self, ids): - return convert_by_vocab(self.inv_vocab, ids) - - class WSSPTokenizer(object): def __init__(self, vocab_file, sp_model_dir, word_dict, ws=True, lower=True): diff --git a/paddlehub/tokenizer/__init__.py b/paddlehub/tokenizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..11452e63cfda2f4da74b327943a6563b7bd2d44e --- /dev/null +++ b/paddlehub/tokenizer/__init__.py @@ -0,0 +1,2 @@ +from .bert_tokenizer import BertTokenizer +from .bert_tokenizer import ErnieTinyTokenizer diff --git a/paddlehub/tokenizer/bert_tokenizer.py b/paddlehub/tokenizer/bert_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..ef45b06c4be6913a5679ca8700e4153c91c8f88f --- /dev/null +++ b/paddlehub/tokenizer/bert_tokenizer.py @@ -0,0 +1,852 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
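# Illustrative sketch (not part of the patch): the new tokenizer package exposes
# BertTokenizer and ErnieTinyTokenizer, so user code can build a tokenizer from a
# module's vocabulary file. The vocab path below is hypothetical and a real
# BERT-style vocab file is required for this to run.
from paddlehub.tokenizer import BertTokenizer

tokenizer = BertTokenizer(vocab_file="/path/to/vocab.txt")   # hypothetical path
print(tokenizer.tokenize("PaddleHub makes transfer learning easy."))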
+"""This file is modified from https://github.com/huggingface/transformers""" + +import collections +import os +import unicodedata +import pickle +from typing import Dict, List, Optional, Union, Tuple + +import sentencepiece as spm + +from .tokenizer_util import load_vocab, is_whitespace, is_control, is_punctuation, whitespace_tokenize, is_chinese_char + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True): + """ Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + never_split: (`optional`) list of str + List of token not to split. + tokenize_chinese_chars: (`optional`) boolean (default True) + Whether to tokenize Chinese characters. + This should likely be deactivated for Japanese: + see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 + """ + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = never_split + self.tokenize_chinese_chars = tokenize_chinese_chars + + def tokenize(self, text, never_split=None): + """ Basic Tokenization of a piece of text. + Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer. + + Args: + **never_split**: (`optional`) list of str + List of token not to split. + """ + never_split = self.never_split + (never_split + if never_split is not None else []) + text = self._clean_text(text) + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
+ if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case and token not in never_split: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + if is_chinese_char(char): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or is_control(char): + continue + if is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + def encode(self): + raise NotImplementedError( + "This tokenizer can only do tokenize(...), " + "the ability to convert tokens to ids has not been implemented") + + def decode(self): + raise NotImplementedError( + "This tokenizer can only do tokenize(...), " + "the ability to convert ids to tokens has not been implemented") + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens. 
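# Illustrative sketch (not part of the patch): what BasicTokenizer above does to a
# mixed-language string - cleaning, CJK spacing, lower-casing, accent stripping
# and punctuation splitting, in that order. Import path assumes this patch is
# installed.
from paddlehub.tokenizer.bert_tokenizer import BasicTokenizer

basic = BasicTokenizer(do_lower_case=True)
print(basic.tokenize("Héllo, 世界!"))
# Expected: ['hello', ',', '世', '界', '!']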
+ """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + def encode(self): + raise NotImplementedError( + "This tokenizer can only do tokenize(...), " + "the ability to convert tokens to ids has not been implemented") + + def decode(self): + raise NotImplementedError( + "This tokenizer can only do tokenize(...), " + "the ability to convert ids to tokens has not been implemented") + + +class BertTokenizer(object): + """ + Constructs a BERT tokenizer. Based on WordPiece. + + Args: + vocab_file (:obj:`string`): + File containing the vocabulary. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to lowercase the input when tokenizing. + do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to do basic tokenization before WordPiece. + never_split (:obj:`bool`, `optional`, defaults to :obj:`True`): + List of tokens which will never be split during tokenization. Only has an effect when + :obj:`do_basic_tokenize=True` + unk_token (:obj:`string`, `optional`, defaults to "[UNK]"): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (:obj:`string`, `optional`, defaults to "[SEP]"): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences + for sequence classification or for a text and a question for question answering. + It is also used as the last token of a sequence built with special tokens. + pad_token (:obj:`string`, `optional`, defaults to "[PAD]"): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`string`, `optional`, defaults to "[CLS]"): + The classifier token which is used when doing sequence classification (classification of the whole + sequence instead of per-token classification). It is the first token of the sequence when built with + special tokens. + mask_token (:obj:`string`, `optional`, defaults to "[MASK]"): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to tokenize Chinese characters. 
+ This should likely be deactivated for Japanese: + see: https://github.com/huggingface/transformers/issues/328 + """ + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + ): + self.unk_token = unk_token + self.sep_token = sep_token + self.pad_token = pad_token + self.cls_token = cls_token + self.mask_token = mask_token + self.do_lower_case = do_lower_case + self.all_special_tokens = [ + unk_token, sep_token, pad_token, cls_token, mask_token + ] + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'.".format(vocab_file)) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars) + self.wordpiece_tokenizer = WordpieceTokenizer( + vocab=self.vocab, unk_token=self.unk_token) + + self.unk_token_id = self.convert_tokens_to_ids(self.unk_token) + self.sep_token_id = self.convert_tokens_to_ids(self.sep_token) + self.pad_token_id = self.convert_tokens_to_ids(self.pad_token) + self.pad_token_type_id = 0 + self.cls_token_id = self.convert_tokens_to_ids(self.cls_token) + self.mask_token_id = self.convert_tokens_to_ids(self.mask_token) + self.all_special_ids = self.convert_tokens_to_ids( + self.all_special_tokens) + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """ Converts a sequence of tokens (string) in a single string. """ + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def convert_tokens_to_ids(self, tokens): + """ Converts a token string (or a sequence of tokens) in a single integer id + (or a sequence of ids), using the vocabulary. + """ + if tokens is None: + return None + + if isinstance(tokens, str): + return self._convert_token_to_id(tokens) + + ids = [] + for token in tokens: + ids.append(self._convert_token_to_id(token)) + return ids + + def convert_ids_to_tokens(self, + ids: Union[int, List[int]], + skip_special_tokens: bool = False + ) -> Union[int, List[int]]: + """ Converts a single index or a sequence of indices (integers) in a token " + (resp.) a sequence of tokens (str), using the vocabulary and added tokens. + + Args: + skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False + """ + if isinstance(ids, int): + return self._convert_id_to_token(ids) + tokens = [] + for index in ids: + index = int(index) + if skip_special_tokens and index in self.all_special_ids: + continue + tokens.append(self._convert_id_to_token(index)) + return tokens + + def tokenize(self, text): + """ Converts a string in a sequence of tokens (string), using the tokenizer. 
+ Split in words for word-based vocabulary or sub-words for sub-word-based + vocabularies (BPE/SentencePieces/WordPieces). + + Take care of added tokens. + + Args: + text (:obj:`string`): The sequence to be encoded. + """ + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize( + text, never_split=self.all_special_tokens): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def build_inputs_with_special_tokens(self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + A BERT sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: list of `input IDs` with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Note: + This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this + inside your training loop. + + Args: + pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the + number of added tokens in the case of a single sequence if set to False. + + Returns: + Number of tokens added to sequences + """ + token_ids_0 = [] + token_ids_1 = [] + return len( + self.build_inputs_with_special_tokens( + token_ids_0, token_ids_1 if pair else None)) + + def get_special_tokens_mask( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False) -> List[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True if the token list is already formatted with special tokens for the model + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." 
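# Illustrative sketch (not part of the patch): the special-token layouts produced
# by build_inputs_with_special_tokens above, using made-up placeholder ids
# (cls=101, sep=102 are only examples).
cls_id, sep_id = 101, 102
ids_a = [7, 8, 9]          # token ids of sentence A
ids_b = [40, 41]           # token ids of sentence B

single = [cls_id] + ids_a + [sep_id]
pair = [cls_id] + ids_a + [sep_id] + ids_b + [sep_id]
print(single)   # [101, 7, 8, 9, 102]              -> [CLS] A [SEP]
print(pair)     # [101, 7, 8, 9, 102, 40, 41, 102] -> [CLS] A [SEP] B [SEP]
# num_special_tokens_to_add: 2 for a single sequence, 3 for a pair.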
+ ) + return list( + map( + lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] + else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ( + [0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_segment_ids_from_sequences( + self, token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. + A BERT sequence pair mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + if token_ids_1 is None, only returns the first portion of the mask (0's). + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs` according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def clean_up_tokenization(self, out_string: str) -> str: + """ Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms. + """ + out_string = (out_string.replace(" .", ".").replace(" ?", "?").replace( + " !", "!").replace(" ,", ",").replace(" ' ", "'").replace( + " n't", + "n't").replace(" 'm", "'m").replace(" 's", "'s").replace( + " 've", "'ve").replace(" 're", "'re")) + return out_string + + def truncate_sequences( + self, + ids: List[int], + pair_ids: Optional[List[int]] = None, + num_tokens_to_remove: int = 0, + truncation_strategy: str = "longest_first", + stride: int = 0, + ) -> Tuple[List[int], List[int], List[int]]: + """ Truncates a sequence pair in place to the maximum length. + + Args: + ids: list of tokenized input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + num_tokens_to_remove (:obj:`int`, `optional`, defaults to ``0``): + number of tokens to remove using the truncation strategy + truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_seq_len + starting from the longest one at each token (when there is a pair of input sequences). + Overflowing tokens only contains overflow from the first sequence. + - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove. + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_seq_len) + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_seq_len, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. 
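# Illustrative sketch (not part of the patch): segment ids as built by
# create_segment_ids_from_sequences above - 0 over "[CLS] A [SEP]", 1 over
# "B [SEP]".
ids_a = [7, 8, 9]
ids_b = [40, 41]
segment_ids = (1 + len(ids_a) + 1) * [0] + (len(ids_b) + 1) * [1]
print(segment_ids)   # [0, 0, 0, 0, 0, 1, 1, 1]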
+ """ + if num_tokens_to_remove <= 0: + return ids, pair_ids, [] + + if truncation_strategy == "longest_first": + overflowing_tokens = [] + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + overflowing_tokens = [ids[-1]] + overflowing_tokens + ids = ids[:-1] + else: + pair_ids = pair_ids[:-1] + window_len = min(len(ids), stride) + if window_len > 0: + overflowing_tokens = ids[-window_len:] + overflowing_tokens + elif truncation_strategy == "only_first": + assert len(ids) > num_tokens_to_remove + window_len = min(len(ids), stride + num_tokens_to_remove) + overflowing_tokens = ids[-window_len:] + ids = ids[:-num_tokens_to_remove] + elif truncation_strategy == "only_second": + assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + overflowing_tokens = pair_ids[-window_len:] + pair_ids = pair_ids[:-num_tokens_to_remove] + elif truncation_strategy == "do_not_truncate": + raise ValueError( + "Input sequence are too long for max_seq_len. Please select a truncation strategy." + ) + else: + raise ValueError( + "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']" + ) + return (ids, pair_ids, overflowing_tokens) + + def encode(self, + text: Union[str, List[str], List[int]], + text_pair: Optional[Union[str, List[str], List[int]]] = None, + max_seq_len: Optional[int] = None, + pad_to_max_seq_len: bool = True, + truncation_strategy: str = "longest_first", + return_position_ids: bool = True, + return_segment_ids: bool = True, + return_input_mask: bool = True, + return_length: bool = True, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False): + """ + Returns a dictionary containing the encoded sequence or sequence pair and additional information: + the mask for sequence classification and the overflowing elements if a ``max_seq_len`` is specified. + + Args: + text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using + the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method) + text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second sequence to be encoded. This can be a string, a list of strings (tokenized + string using the `tokenize` method) or a list of integers (tokenized string ids using the + `convert_tokens_to_ids` method) + max_seq_len (:obj:`int`, `optional`, defaults to :int:`None`): + If set to a number, will limit the total sequence returned so that it has a maximum length. + If there are overflowing tokens, those will be added to the returned dictionary + pad_to_max_seq_len (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the + model's max length. 
+ truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): + String selected in the following options: + + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_seq_len + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_seq_len) + return_position_ids (:obj:`bool`, `optional`, defaults to :obj:`True`): + Set to True to return tokens position ids (default True). + return_segment_ids (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to return token type IDs. + return_input_mask (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to return the attention mask. + return_length (:obj:`int`, defaults to :obj:`True`): + If set the resulting dictionary will include the length of each encoded inputs + return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return overflowing token information (default False). + return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return special tokens mask information (default False). + + Return: + A Dictionary of shape:: + + { + input_ids: list[int], + position_ids: list[int] if return_position_ids is True (default) + segment_ids: list[int] if return_segment_ids is True (default) + input_mask: list[int] if return_input_mask is True (default) + seq_len: int if return_length is True (default) + overflowing_tokens: list[int] if a ``max_seq_len`` is specified and return_overflowing_tokens is True + num_truncated_tokens: int if a ``max_seq_len`` is specified and return_overflowing_tokens is True + special_tokens_mask: list[int] if return_special_tokens_mask is True + } + + With the fields: + + - ``input_ids``: list of token ids to be fed to a model + - ``position_ids``: list of token position ids to be fed to a model + - ``segment_ids``: list of token type ids to be fed to a model + - ``input_mask``: list of indices specifying which tokens should be attended to by the model + - ``length``: the input_ids length + - ``overflowing_tokens``: list of overflowing tokens if a max length is specified. + - ``num_truncated_tokens``: number of overflowing tokens a ``max_seq_len`` is specified + - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added + tokens and 1 specifying sequence tokens. + """ + + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize(text) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, + (list, tuple)) and len(text) > 0 and isinstance( + text[0], str): + return self.convert_tokens_to_ids(text) + elif isinstance(text, + (list, tuple)) and len(text) > 0 and isinstance( + text[0], int): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." 
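# Illustrative sketch (not part of the patch): the dictionary returned by encode()
# above for a padded sequence pair. The vocab path is hypothetical and the exact
# ids depend on the vocabulary, so only the structure is checked.
from paddlehub.tokenizer import BertTokenizer

tokenizer = BertTokenizer(vocab_file="/path/to/vocab.txt")   # hypothetical path
record = tokenizer.encode(text="A question.", text_pair="An answer.",
                          max_seq_len=16)
print(sorted(record.keys()))
# ['input_ids', 'input_mask', 'position_ids', 'segment_ids', 'seq_len']
assert len(record["input_ids"]) == 16   # padded up to max_seq_len
assert record["seq_len"] <= 16          # length before padding was applied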
+ ) + + ids = get_input_ids(text) + pair_ids = get_input_ids(text_pair) if text_pair is not None else None + + pair = bool(pair_ids is not None) + len_ids = len(ids) + len_pair_ids = len(pair_ids) if pair else 0 + + encoded_inputs = {} + + # Truncation: Handle max sequence length + total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add( + pair=pair)) + if max_seq_len and total_len > max_seq_len: + ids, pair_ids, overflowing_tokens = self.truncate_sequences( + ids, + pair_ids=pair_ids, + num_tokens_to_remove=total_len - max_seq_len, + truncation_strategy=truncation_strategy, + ) + if return_overflowing_tokens: + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["num_truncated_tokens"] = total_len - max_seq_len + + # Add special tokens + + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + segment_ids = self.create_segment_ids_from_sequences(ids, pair_ids) + + # Build output dictionnary + encoded_inputs["input_ids"] = sequence + if return_segment_ids: + encoded_inputs["segment_ids"] = segment_ids + if return_special_tokens_mask: + encoded_inputs[ + "special_tokens_mask"] = self.get_special_tokens_mask( + ids, pair_ids) + if return_length: + encoded_inputs["seq_len"] = len(encoded_inputs["input_ids"]) + + # Check lengths + assert max_seq_len is None or len( + encoded_inputs["input_ids"]) <= max_seq_len + + # Padding + needs_to_be_padded = pad_to_max_seq_len and \ + max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len + + if needs_to_be_padded: + difference = max_seq_len - len(encoded_inputs["input_ids"]) + if return_input_mask: + encoded_inputs["input_mask"] = [1] * len( + encoded_inputs["input_ids"]) + [0] * difference + if return_segment_ids: + encoded_inputs["segment_ids"] = ( + encoded_inputs["segment_ids"] + + [self.pad_token_type_id] * difference) + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = encoded_inputs[ + "special_tokens_mask"] + [1] * difference + encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [ + self.pad_token_id + ] * difference + else: + if return_input_mask: + encoded_inputs["input_mask"] = [1] * len( + encoded_inputs["input_ids"]) + + if return_position_ids: + encoded_inputs["position_ids"] = list( + range(len(encoded_inputs["input_ids"]))) + + return encoded_inputs + + def decode(self, + token_ids: Union[List[int], Dict], + only_convert_to_tokens: bool = False, + skip_pad_token: bool = False, + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True): + """ + Converts a sequence of ids (integer) to a string if only_convert_to_tokens is False or a list a sequence of tokens (str) + when only_convert_to_tokens is True. + + Args: + token_ids: list of tokenized input ids or dict containing a key called "input_ids", can be obtained using the `encode` methods. + only_convert_to_tokens: if set to True, will only return a list a sequence of tokens (str). `paddlehub.dataset.base_nlp_dataset` will use this optional argument. + skip_pad_token: if set to True, will replace pad tokens. + skip_special_tokens: if set to True, will replace special tokens. + clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces. 
+ """ + if isinstance(token_ids, dict): + token_ids = token_ids["input_ids"] + + filtered_tokens = self.convert_ids_to_tokens( + token_ids, skip_special_tokens=skip_special_tokens) + + tokens = [] + for token in filtered_tokens: + if skip_pad_token and token == self.pad_token: + continue + tokens.append(token) + if only_convert_to_tokens: + return tokens + + if tokens: + text = self.convert_tokens_to_string(tokens) + else: + text = "" + + if clean_up_tokenization_spaces: + clean_text = self.clean_up_tokenization(text) + return clean_text + else: + return text + + +class ErnieTinyTokenizer(BertTokenizer): + def __init__( + self, + vocab_file, + spm_path, + word_dict_path, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + ): + self.unk_token = unk_token + self.sep_token = sep_token + self.pad_token = pad_token + self.cls_token = cls_token + self.mask_token = mask_token + self.do_lower_case = do_lower_case + self.all_special_tokens = [ + unk_token, sep_token, pad_token, cls_token, mask_token + ] + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'.".format(vocab_file)) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in self.vocab.items()]) + + # Here is the difference with BertTokenizer. + self.dict = pickle.load(open(word_dict_path, 'rb')) + self.sp_model = spm.SentencePieceProcessor() + self.window_size = 5 + self.sp_model.Load(spm_path) + + self.unk_token_id = self.convert_tokens_to_ids(self.unk_token) + self.sep_token_id = self.convert_tokens_to_ids(self.sep_token) + self.pad_token_id = self.convert_tokens_to_ids(self.pad_token) + self.pad_token_type_id = 0 + self.cls_token_id = self.convert_tokens_to_ids(self.cls_token) + self.mask_token_id = self.convert_tokens_to_ids(self.mask_token) + self.all_special_ids = self.convert_tokens_to_ids( + self.all_special_tokens) + + def cut(self, chars): + words = [] + idx = 0 + while idx < len(chars): + matched = False + for i in range(self.window_size, 0, -1): + cand = chars[idx:idx + i] + if cand in self.dict: + words.append(cand) + matched = True + break + if not matched: + i = 1 + words.append(chars[idx]) + idx += i + return words + + def tokenize(self, text): + text = [s for s in self.cut(text) if s != ' '] + if self.do_lower_case: + text = [s.lower() for s in text] + text = ' '.join(text) + tokens = self.sp_model.EncodeAsPieces(text) + in_vocab_tokens = [] + for token in tokens: + if token in self.vocab: + in_vocab_tokens.append(token) + else: + in_vocab_tokens.append(self.unk_token) + return in_vocab_tokens diff --git a/paddlehub/tokenizer/tokenizer_util.py b/paddlehub/tokenizer/tokenizer_util.py new file mode 100644 index 0000000000000000000000000000000000000000..6ab0199dea705c282d383bf208846da43efa6b8b --- /dev/null +++ b/paddlehub/tokenizer/tokenizer_util.py @@ -0,0 +1,86 @@ +from collections import OrderedDict +import unicodedata + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = {} + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n").split("\t")[0] + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +def 
is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or ( + cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + + +def is_chinese_char(char): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + cp = ord(char) + if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False diff --git a/requirements.txt b/requirements.txt index 5824da115cd595900900a4baba6d4bddb35f7744..37372665c93089f01b9331dd9e1bd9ff6b9f116b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,7 @@ visualdl >= 2.0.0b cma >= 2.7.0 sentencepiece colorlog +tqdm # pandas no longer support python2 in version 0.25 and above pandas ; python_version >= "3"
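# Illustrative sketch (not part of the patch): the character-class helpers added
# in tokenizer_util.py above. Import path assumes this patch is installed.
from paddlehub.tokenizer.tokenizer_util import (is_chinese_char, is_control,
                                                is_punctuation, is_whitespace)

print(is_whitespace("\t"))    # True  - tabs count as whitespace here
print(is_control("\t"))       # False - ...and therefore not as control characters
print(is_punctuation("$"))    # True  - non-letter/number ASCII is treated as punctuation
print(is_chinese_char("中"))  # True  - inside the CJK Unified Ideographs block
print(is_chinese_char("カ"))  # False - Katakana is outside the ranges checked here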