Unverified commit a1cf32cd, authored by kinghuin, committed by GitHub

Tokenizer refactor (#677)

Parent: a253ecaa
...@@ -39,18 +39,17 @@ if __name__ == '__main__':
     inputs, outputs, program = module.context(
         trainable=True, max_seq_len=args.max_seq_len)
-    # Download dataset and use MultiLabelReader to read dataset
-    dataset = hub.dataset.Toxic()
-    reader = hub.reader.MultiLabelClassifyReader(
-        dataset=dataset,
-        vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
-    # Setup feed list for data feeder
-    feed_list = [
-        inputs["input_ids"].name, inputs["position_ids"].name,
-        inputs["segment_ids"].name, inputs["input_mask"].name
-    ]
+    # Use the appropriate tokenizer to preprocess the data set
+    # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
+    if module.name == "ernie_tiny":
+        tokenizer = hub.ErnieTinyTokenizer(
+            vocab_file=module.get_vocab_path(),
+            spm_path=module.get_spm_path(),
+            word_dict_path=module.get_word_dict_path())
+    else:
+        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
+    dataset = hub.dataset.Toxic(
+        tokenizer=tokenizer, max_seq_len=args.max_seq_len)
     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
...@@ -72,9 +71,8 @@ if __name__ == '__main__':
     # Define a classfication fine-tune task by PaddleHub's API
     multi_label_cls_task = hub.MultiLabelClassifierTask(
-        data_reader=reader,
+        dataset=dataset,
         feature=pooled_output,
-        feed_list=feed_list,
         num_classes=dataset.num_labels,
         config=config)
......
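Taken together, the change above replaces the reader / feed_list pipeline with a tokenizer that is passed straight to the dataset, and the task now receives the dataset itself. A minimal sketch of the resulting fine-tune flow, using only the calls that appear in this diff; the RunConfig values and the final finetune_and_eval() call are illustrative assumptions, not part of this commit:

import paddlehub as hub

module = hub.Module(name="ernie_tiny")
inputs, outputs, program = module.context(trainable=True, max_seq_len=128)

# ernie_tiny needs its sentencepiece model and word dict; other modules fall back to BertTokenizer
tokenizer = hub.ErnieTinyTokenizer(
    vocab_file=module.get_vocab_path(),
    spm_path=module.get_spm_path(),
    word_dict_path=module.get_word_dict_path())

# The dataset now runs the tokenizer itself instead of a hub.reader.* class
dataset = hub.dataset.Toxic(tokenizer=tokenizer, max_seq_len=128)

config = hub.RunConfig(num_epoch=1, batch_size=32, use_cuda=False)  # illustrative values
multi_label_cls_task = hub.MultiLabelClassifierTask(
    dataset=dataset,
    feature=outputs["pooled_output"],
    num_classes=dataset.num_labels,
    config=config)
multi_label_cls_task.finetune_and_eval()  # assumed entry point, unchanged by this commit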
...@@ -45,20 +45,11 @@ if __name__ == '__main__':
     inputs, outputs, program = module.context(
         trainable=True, max_seq_len=args.max_seq_len)
-    # Download dataset and use MultiLabelReader to read dataset
+    # Download dataset and get its label list and label num
+    # If you just want labels information, you can omit its tokenizer parameter to avoid preprocessing the train set.
     dataset = hub.dataset.Toxic()
-    reader = hub.reader.MultiLabelClassifyReader(
-        dataset=dataset,
-        vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
-    # Setup feed list for data feeder
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
+    num_classes = dataset.num_labels
+    label_list = dataset.get_labels()
     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
...@@ -75,20 +66,29 @@ if __name__ == '__main__':
     # Define a classfication fine-tune task by PaddleHub's API
     multi_label_cls_task = hub.MultiLabelClassifierTask(
-        data_reader=reader,
+        dataset=dataset,
         feature=pooled_output,
-        feed_list=feed_list,
         num_classes=dataset.num_labels,
         config=config)
     # Data to be predicted
     data = [
-        [
-            "Yes you did. And you admitted to doing it. See the Warren Kinsella talk page."
-        ],
-        [
-            "I asked you a question. We both know you have my page on your watch list, so are why are you playing games and making me formally ping you? Makin'Bacon"
-        ],
+        "Yes you did. And you admitted to doing it. See the Warren Kinsella talk page.",
+        "I asked you a question. We both know you have my page on your watch list, so are why are you playing games and making me formally ping you? Makin'Bacon",
     ]
-    print(multi_label_cls_task.predict(data=data, return_result=True))
+    # Use the appropriate tokenizer to preprocess the data
+    # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
+    if module.name == "ernie_tiny":
+        tokenizer = hub.ErnieTinyTokenizer(
+            vocab_file=module.get_vocab_path(),
+            spm_path=module.get_spm_path(),
+            word_dict_path=module.get_word_dict_path())
+    else:
+        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
+    encoded_data = [
+        tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
+        for text in data
+    ]
+    print(
+        multi_label_cls_task.predict(data=encoded_data, label_list=label_list))
...@@ -36,31 +36,28 @@ args = parser.parse_args()
 if __name__ == '__main__':
     # Load Paddlehub ERNIE pretrained model
-    module = hub.Module(name="ernie")
+    module = hub.Module(name="ernie_tiny")
     inputs, outputs, program = module.context(
         trainable=True, max_seq_len=args.max_seq_len)
-    # Download dataset and use ClassifyReader to read dataset
-    dataset = hub.dataset.NLPCC_DBQA()
-    reader = hub.reader.ClassifyReader(
-        dataset=dataset,
-        vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
+    # Use the appropriate tokenizer to preprocess the data set
+    # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
+    if module.name == "ernie_tiny":
+        tokenizer = hub.ErnieTinyTokenizer(
+            vocab_file=module.get_vocab_path(),
+            spm_path=module.get_spm_path(),
+            word_dict_path=module.get_word_dict_path())
+    else:
+        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
+    dataset = hub.dataset.NLPCC_DBQA(
+        tokenizer=tokenizer, max_seq_len=args.max_seq_len)
     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     # Use "sequence_output" for token-level output.
     pooled_output = outputs["pooled_output"]
-    # Setup feed list for data feeder
-    # Must feed all the tensor of ERNIE's module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
     # Select fine-tune strategy, setup config and fine-tune
     strategy = hub.AdamWeightDecayStrategy(
         warmup_proportion=args.warmup_proportion,
...@@ -78,9 +75,8 @@ if __name__ == '__main__':
     # Define a classfication fine-tune task by PaddleHub's API
     cls_task = hub.TextClassifierTask(
-        data_reader=reader,
+        dataset=dataset,
         feature=pooled_output,
-        feed_list=feed_list,
         num_classes=dataset.num_labels,
         config=config)
......
...@@ -39,30 +39,20 @@ args = parser.parse_args()
 if __name__ == '__main__':
     # loading Paddlehub ERNIE pretrained model
-    module = hub.Module(name="ernie")
+    module = hub.Module(name="ernie_tiny")
     inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
-    # Sentence classification dataset reader
+    # Download dataset and get its label list and label num
+    # If you just want labels information, you can omit its tokenizer parameter to avoid preprocessing the train set.
     dataset = hub.dataset.NLPCC_DBQA()
-    reader = hub.reader.ClassifyReader(
-        dataset=dataset,
-        vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
+    num_classes = dataset.num_labels
+    label_list = dataset.get_labels()
     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     # Use "sequence_output" for token-level output.
     pooled_output = outputs["pooled_output"]
-    # Setup feed list for data feeder
-    # Must feed all the tensor of ERNIE's module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
     # Setup RunConfig for PaddleHub Fine-tune API
     config = hub.RunConfig(
         use_data_parallel=False,
...@@ -73,9 +63,8 @@ if __name__ == '__main__':
     # Define a classfication fine-tune task by PaddleHub's API
     cls_task = hub.TextClassifierTask(
-        data_reader=reader,
+        dataset=dataset,
         feature=pooled_output,
-        feed_list=feed_list,
         num_classes=dataset.num_labels,
         config=config)
...@@ -83,5 +72,18 @@ if __name__ == '__main__':
     data = [["北京奥运博物馆的场景效果负责人是谁?", "主要承担奥运文物征集、保管、研究和爱国主义教育基地建设相关工作。"],
            ["北京奥运博物馆的场景效果负责人是谁", "于海勃,美国加利福尼亚大学教授 场景效果负责人 总设计师"],
            ["北京奥运博物馆的场景效果负责人是谁?", "洪麦恩,清华大学美术学院教授 内容及主展线负责人 总设计师"]]
-    print(cls_task.predict(data=data, return_result=True))
+    # Use the appropriate tokenizer to preprocess the data
+    # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
+    if module.name == "ernie_tiny":
+        tokenizer = hub.ErnieTinyTokenizer(
+            vocab_file=module.get_vocab_path(),
+            spm_path=module.get_spm_path(),
+            word_dict_path=module.get_word_dict_path())
+    else:
+        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
+    encoded_data = [
+        tokenizer.encode(
+            text=text, text_pair=text_pair, max_seq_len=args.max_seq_len)
+        for text, text_pair in data
+    ]
+    print(cls_task.predict(data=encoded_data, label_list=label_list))
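For sentence-pair tasks like NLPCC-DBQA, the tokenizer encodes both texts into a single record. A sketch of what that looks like; the keys listed are the ones this commit's dataset and task code read, while the exact contents depend on the tokenizer:

import paddlehub as hub

module = hub.Module(name="ernie_tiny")
tokenizer = hub.ErnieTinyTokenizer(
    vocab_file=module.get_vocab_path(),
    spm_path=module.get_spm_path(),
    word_dict_path=module.get_word_dict_path())

record = tokenizer.encode(
    text="北京奥运博物馆的场景效果负责人是谁?",
    text_pair="于海勃,美国加利福尼亚大学教授 场景效果负责人 总设计师",
    max_seq_len=128)
# record is a dict; downstream code in this commit reads keys such as
# "input_ids", "position_ids", "segment_ids", "input_mask" and "seq_len".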
...@@ -17,7 +17,6 @@
 import argparse
 import ast
-import paddle.fluid as fluid
 import paddlehub as hub
 hub.common.logger.logger.setLevel("INFO")
...@@ -42,28 +41,23 @@ if __name__ == '__main__':
     inputs, outputs, program = module.context(
         trainable=True, max_seq_len=args.max_seq_len)
-    # Download dataset and use ReadingComprehensionReader to read dataset
+    # Use the appropriate tokenizer to preprocess the data set
+    # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
+    if module.name == "ernie_tiny":
+        tokenizer = hub.ErnieTinyTokenizer(
+            vocab_file=module.get_vocab_path(),
+            spm_path=module.get_spm_path(),
+            word_dict_path=module.get_word_dict_path())
+    else:
+        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
     # If you wanna load SQuAD 2.0 dataset, just set version_2_with_negative as True
-    dataset = hub.dataset.SQUAD(version_2_with_negative=False)
+    dataset = hub.dataset.SQUAD(
+        version_2_with_negative=False,
+        tokenizer=tokenizer,
+        max_seq_len=args.max_seq_len)
     # dataset = hub.dataset.SQUAD(version_2_with_negative=True)
-    reader = hub.reader.ReadingComprehensionReader(
-        dataset=dataset,
-        vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len,
-        doc_stride=128,
-        max_query_length=64)
-    seq_output = outputs["sequence_output"]
-    # Setup feed list for data feeder
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
     # Select fine-tune strategy, setup config and fine-tune
     strategy = hub.AdamWeightDecayStrategy(
         weight_decay=args.weight_decay,
...@@ -72,7 +66,7 @@ if __name__ == '__main__':
     # Setup RunConfig for PaddleHub Fine-tune API
     config = hub.RunConfig(
-        eval_interval=300,
+        eval_interval=100,
         use_data_parallel=args.use_data_parallel,
         use_cuda=args.use_gpu,
         num_epoch=args.num_epoch,
...@@ -82,9 +76,8 @@ if __name__ == '__main__':
     # Define a reading comprehension fine-tune task by PaddleHub's API
     reading_comprehension_task = hub.ReadingComprehensionTask(
-        data_reader=reader,
-        feature=seq_output,
-        feed_list=feed_list,
+        dataset=dataset,
+        feature=outputs["sequence_output"],
         config=config,
         sub_task="squad",
     )
......
...@@ -20,12 +20,6 @@ from __future__ import print_function
 import argparse
 import ast
-import numpy as np
-import os
-import time
-import paddle
-import paddle.fluid as fluid
 import paddlehub as hub
 # yapf: disable
...@@ -43,27 +37,11 @@ if __name__ == '__main__':
     inputs, outputs, program = module.context(
         trainable=True, max_seq_len=args.max_seq_len)
-    # Download dataset and use RegressionReader to read dataset
-    dataset = hub.dataset.GLUE("STS-B")
-    reader = hub.reader.RegressionReader(
-        dataset=dataset,
-        vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     # Use "sequence_output" for token-level output.
     pooled_output = outputs["pooled_output"]
-    # Setup feed list for data feeder
-    # Must feed all the tensor of ERNIE's module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
     # Setup RunConfig for PaddleHub Fine-tune API
     config = hub.RunConfig(
         use_data_parallel=False,
...@@ -74,13 +52,22 @@ if __name__ == '__main__':
     # Define a regression fine-tune task by PaddleHub's API
     reg_task = hub.RegressionTask(
-        data_reader=reader,
         feature=pooled_output,
-        feed_list=feed_list,
         config=config,
     )
-    # Data to be prdicted
-    data = [[d.text_a, d.text_b] for d in dataset.get_predict_examples()[:10]]
-    print(reg_task.predict(data=data, return_result=True))
+    # STS-B has provided the predict data, and the dataset has process it. If you want to process customized data,
+    # see the predict.py in text_classification demo
+    # Use the appropriate tokenizer to preprocess the data
+    # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
+    if module.name == "ernie_tiny":
+        tokenizer = hub.ErnieTinyTokenizer(
+            vocab_file=module.get_vocab_path(),
+            spm_path=module.get_spm_path(),
+            word_dict_path=module.get_word_dict_path())
+    else:
+        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
+    dataset = hub.dataset.GLUE(
+        "STS-B", tokenizer=tokenizer, max_seq_len=args.max_seq_len)
+    encoded_data = dataset.get_predict_records()[:10]
+    print(reg_task.predict(data=encoded_data))
...@@ -17,7 +17,6 @@
 import argparse
 import ast
-import paddle.fluid as fluid
 import paddlehub as hub
 # yapf: disable
...@@ -41,27 +40,24 @@ if __name__ == '__main__':
     inputs, outputs, program = module.context(
         trainable=True, max_seq_len=args.max_seq_len)
-    # Download dataset and use RegressionReader to read dataset
-    dataset = hub.dataset.GLUE("STS-B")
-    reader = hub.reader.RegressionReader(
-        dataset=dataset,
-        vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
+    # Use the appropriate tokenizer to preprocess the data set
+    # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
+    if module.name == "ernie_tiny":
+        tokenizer = hub.ErnieTinyTokenizer(
+            vocab_file=module.get_vocab_path(),
+            spm_path=module.get_spm_path(),
+            word_dict_path=module.get_word_dict_path())
+    else:
+        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
+    dataset = hub.dataset.GLUE(
+        "STS-B", tokenizer=tokenizer, max_seq_len=args.max_seq_len)
     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     # Use "sequence_output" for token-level output.
     pooled_output = outputs["pooled_output"]
-    # Setup feed list for data feeder
-    # Must feed all the tensor of ERNIE's module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
     # Select fine-tune strategy, setup config and fine-tune
     strategy = hub.AdamWeightDecayStrategy(
         warmup_proportion=args.warmup_proportion,
...@@ -70,7 +66,6 @@ if __name__ == '__main__':
     # Setup RunConfig for PaddleHub Fine-tune API
     config = hub.RunConfig(
-        eval_interval=300,
         use_data_parallel=args.use_data_parallel,
         use_cuda=args.use_gpu,
         num_epoch=args.num_epoch,
...@@ -80,10 +75,7 @@ if __name__ == '__main__':
     # Define a regression fine-tune task by PaddleHub's API
     reg_task = hub.RegressionTask(
-        data_reader=reader,
-        feature=pooled_output,
-        feed_list=feed_list,
-        config=config)
+        dataset=dataset, feature=pooled_output, config=config)
     # Fine-tune and evaluate by PaddleHub's API
     # will finish training, evaluation, testing, save model automatically
......
...@@ -42,30 +42,16 @@ if __name__ == '__main__':
     module = hub.Module(name="ernie_tiny")
     inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
-    # Sentence labeling dataset reader
+    # Download dataset and get its label list and label num
+    # If you just want labels information, you can omit its tokenizer parameter to avoid preprocessing the train set.
     dataset = hub.dataset.MSRA_NER()
-    reader = hub.reader.SequenceLabelReader(
-        dataset=dataset,
-        vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len,
-        sp_model_path=module.get_spm_path(),
-        word_dict_path=module.get_word_dict_path())
-    inv_label_map = {val: key for key, val in reader.label_map.items()}
+    num_classes = dataset.num_labels
+    label_list = dataset.get_labels()
     # Construct transfer learning network
     # Use "sequence_output" for token-level output.
     sequence_output = outputs["sequence_output"]
-    # Setup feed list for data feeder
-    # Must feed all the tensor of ERNIE's module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
     # Setup RunConfig for PaddleHub Fine-tune API
     config = hub.RunConfig(
         use_data_parallel=False,
...@@ -77,33 +63,31 @@ if __name__ == '__main__':
     # Define a sequence labeling fine-tune task by PaddleHub's API
     # if add crf, the network use crf as decoder
     seq_label_task = hub.SequenceLabelTask(
-        data_reader=reader,
         feature=sequence_output,
-        feed_list=feed_list,
         max_seq_len=args.max_seq_len,
-        num_classes=dataset.num_labels,
+        num_classes=num_classes,
         config=config,
         add_crf=False)
     # Data to be predicted
-    # If using python 2, prefix "u" is necessary
-    data = [
-        [u"我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。"],
-        [u"为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。"],
-        [u"其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。"],
-        [u"有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。"],
-        [u"不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。"],
+    text_a = [
+        "我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。",
+        "为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。",
+        "其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。",
+        "有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。",
+        "不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。",
     ]
     # Add 0x02 between characters to match the format of training data,
     # otherwise the length of prediction results will not match the input string
     # if the input string contains non-Chinese characters.
-    tmp_data = []
-    for example in data:
-        formatted = []
-        for sentence in example:
-            formatted.append('\x02'.join(list(sentence)))
-        tmp_data.append(formatted)
-    data = tmp_data
-    print(seq_label_task.predict(data=data, return_result=True))
+    formatted_text_a = list(map("\002".join, text_a))
+    # Use the appropriate tokenizer to preprocess the data
+    # For ernie_tiny, it use BertTokenizer too.
+    tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
+    encoded_data = [
+        tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
+        for text in formatted_text_a
+    ]
+    print(seq_label_task.predict(data=encoded_data, label_list=label_list))
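The 0x02 trick above simply inserts the byte-2 separator between every character so that each character becomes its own token and predictions stay aligned with the input. A tiny illustration, not part of the commit:

text = "Hub模型2020"
formatted = "\002".join(text)
# formatted == 'H\x02u\x02b\x02模\x02型\x022\x020\x022\x020'
# i.e. one character per token, which matches the MSRA_NER training format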
...@@ -40,26 +40,16 @@ if __name__ == '__main__':
     inputs, outputs, program = module.context(
         trainable=True, max_seq_len=args.max_seq_len)
-    # Download dataset and use SequenceLabelReader to read dataset
-    dataset = hub.dataset.MSRA_NER()
-    reader = hub.reader.SequenceLabelReader(
-        dataset=dataset,
-        vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len,
-        sp_model_path=module.get_spm_path(),
-        word_dict_path=module.get_word_dict_path())
+    # Use the appropriate tokenizer to preprocess the data set
+    # For ernie_tiny, it use BertTokenizer too.
+    tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
+    dataset = hub.dataset.MSRA_NER(
+        tokenizer=tokenizer, max_seq_len=args.max_seq_len)
     # Construct transfer learning network
     # Use "sequence_output" for token-level output.
     sequence_output = outputs["sequence_output"]
-    # Setup feed list for data feeder
-    # Must feed all the tensor of module need
-    feed_list = [
-        inputs["input_ids"].name, inputs["position_ids"].name,
-        inputs["segment_ids"].name, inputs["input_mask"].name
-    ]
     # Select a fine-tune strategy
     strategy = hub.AdamWeightDecayStrategy(
         warmup_proportion=args.warmup_proportion,
...@@ -78,9 +68,8 @@ if __name__ == '__main__':
     # Define a sequence labeling fine-tune task by PaddleHub's API
     # If add crf, the network use crf as decoder
     seq_label_task = hub.SequenceLabelTask(
-        data_reader=reader,
+        dataset=dataset,
         feature=sequence_output,
-        feed_list=feed_list,
         max_seq_len=args.max_seq_len,
         num_classes=dataset.num_labels,
         config=config,
......
...@@ -21,9 +21,9 @@ parser.add_argument("--max_seq_len", type=int, default=512,
 # yapf: enable.
-class TransformerSequenceLabelLayer(fluid.dygraph.Layer):
+class TransformerSeqLabeling(fluid.dygraph.Layer):
     def __init__(self, num_classes, transformer):
-        super(TransformerSequenceLabelLayer, self).__init__()
+        super(TransformerSeqLabeling, self).__init__()
         self.num_classes = num_classes
         self.transformer = transformer
         self.fc = Linear(input_dim=768, output_dim=num_classes)
...@@ -39,11 +39,15 @@ class TransformerSequenceLabelLayer(fluid.dygraph.Layer):
 def finetune(args):
-    ernie = hub.Module(name="ernie", max_seq_len=args.max_seq_len)
+    module = hub.Module(name="ernie", max_seq_len=args.max_seq_len)
+    # Use the appropriate tokenizer to preprocess the data set
+    tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
+    dataset = hub.dataset.MSRA_NER(
+        tokenizer=tokenizer, max_seq_len=args.max_seq_len)
     with fluid.dygraph.guard():
-        dataset = hub.dataset.MSRA_NER()
-        ts = TransformerSequenceLabelLayer(
-            num_classes=dataset.num_labels, transformer=ernie)
+        ts = TransformerSeqLabeling(
+            num_classes=dataset.num_labels, transformer=module)
         adam = AdamOptimizer(learning_rate=1e-5, parameter_list=ts.parameters())
         state_dict_path = os.path.join(args.checkpoint_dir,
                                        'dygraph_state_dict')
...@@ -51,34 +55,32 @@ def finetune(args):
             state_dict, _ = fluid.load_dygraph(state_dict_path)
             ts.load_dict(state_dict)
-        reader = hub.reader.SequenceLabelReader(
-            dataset=dataset,
-            vocab_path=ernie.get_vocab_path(),
-            max_seq_len=args.max_seq_len,
-            sp_model_path=ernie.get_spm_path(),
-            word_dict_path=ernie.get_word_dict_path())
-        train_reader = reader.data_generator(
-            batch_size=args.batch_size, phase='train')
         loss_sum = total_infer = total_label = total_correct = cnt = 0
-        # Run num_epoch rounds of training
        for epoch in range(args.num_epoch):
-            # Read the training data and train
-            for batch_id, data in enumerate(train_reader()):
-                input_ids = np.array(data[0][0]).astype(np.int64)
-                position_ids = np.array(data[0][1]).astype(np.int64)
-                segment_ids = np.array(data[0][2]).astype(np.int64)
-                input_mask = np.array(data[0][3]).astype(np.float32)
-                labels = np.array(data[0][4]).astype(np.int64).reshape(-1, 1)
-                seq_len = np.squeeze(
-                    np.array(data[0][5]).astype(np.int64), axis=1)
+            for batch_id, data in enumerate(
+                    dataset.batch_records_generator(
+                        phase="train",
+                        batch_size=args.batch_size,
+                        shuffle=True,
+                        pad_to_batch_max_seq_len=False)):
+                batch_size = len(data["input_ids"])
+                input_ids = np.array(data["input_ids"]).astype(
+                    np.int64).reshape([batch_size, -1, 1])
+                position_ids = np.array(data["position_ids"]).astype(
+                    np.int64).reshape([batch_size, -1, 1])
+                segment_ids = np.array(data["segment_ids"]).astype(
+                    np.int64).reshape([batch_size, -1, 1])
+                input_mask = np.array(data["input_mask"]).astype(
+                    np.float32).reshape([batch_size, -1, 1])
+                labels = np.array(data["label"]).astype(np.int64).reshape(-1, 1)
+                seq_len = np.array(data["seq_len"]).astype(np.int64).reshape(
+                    -1, 1)
                 pred, ret_infers = ts(input_ids, position_ids, segment_ids,
                                       input_mask)
                 loss = fluid.layers.cross_entropy(pred, to_variable(labels))
                 avg_loss = fluid.layers.mean(loss)
                 avg_loss.backward()
-                # Update the parameters
                 adam.minimize(avg_loss)
                 loss_sum += avg_loss.numpy() * labels.shape[0]
......
...@@ -20,11 +20,7 @@ from __future__ import print_function
 import argparse
 import ast
-import numpy as np
-import os
-import time
-import paddle
-import paddle.fluid as fluid
 import paddlehub as hub
 # yapf: disable
...@@ -43,32 +39,11 @@ if __name__ == '__main__':
     inputs, outputs, program = module.context(
         trainable=True, max_seq_len=args.max_seq_len)
-    # Download dataset and use accuracy as metrics
-    # Choose dataset: GLUE/XNLI/ChinesesGLUE/NLPCC-DBQA/LCQMC
+    # Download dataset and get its label list and label num
+    # If you just want labels information, you can omit its tokenizer parameter to avoid preprocessing the train set.
     dataset = hub.dataset.ChnSentiCorp()
-    # For ernie_tiny, it use sub-word to tokenize chinese sentence
-    # If not ernie tiny, sp_model_path and word_dict_path should be set None
-    reader = hub.reader.ClassifyReader(
-        dataset=dataset,
-        vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len,
-        sp_model_path=module.get_spm_path(),
-        word_dict_path=module.get_word_dict_path())
-    # Construct transfer learning network
-    # Use "pooled_output" for classification tasks on an entire sentence.
-    # Use "sequence_output" for token-level output.
-    pooled_output = outputs["pooled_output"]
-    # Setup feed list for data feeder
-    # Must feed all the tensor of module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
+    num_classes = dataset.num_labels
+    label_list = dataset.get_labels()
     # Setup RunConfig for PaddleHub Fine-tune API
     config = hub.RunConfig(
...@@ -80,14 +55,26 @@ if __name__ == '__main__':
     # Define a classfication fine-tune task by PaddleHub's API
     cls_task = hub.TextClassifierTask(
-        data_reader=reader,
-        feature=pooled_output,
-        feed_list=feed_list,
-        num_classes=dataset.num_labels,
+        feature=outputs["pooled_output"],
+        num_classes=num_classes,
         config=config)
     # Data to be prdicted
-    data = [["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"], ["交通方便;环境很好;服务态度很好 房间较小"],
-            ["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"]]
-    print(cls_task.predict(data=data, return_result=True))
+    text_a = [
+        "这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般", "交通方便;环境很好;服务态度很好 房间较小",
+        "19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"
+    ]
+    # Use the appropriate tokenizer to preprocess the data
+    # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
+    if module.name == "ernie_tiny":
+        tokenizer = hub.ErnieTinyTokenizer(
+            vocab_file=module.get_vocab_path(),
+            spm_path=module.get_spm_path(),
+            word_dict_path=module.get_word_dict_path())
+    else:
+        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
+    encoded_data = [
+        tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
+        for text in text_a
+    ]
+    print(cls_task.predict(data=encoded_data, label_list=label_list))
...@@ -20,11 +20,7 @@ from __future__ import print_function
 import argparse
 import ast
-import numpy as np
-import os
-import time
-import paddle
-import paddle.fluid as fluid
 import paddlehub as hub
 # yapf: disable
...@@ -44,33 +40,17 @@ if __name__ == '__main__':
     inputs, outputs, program = module.context(
         trainable=True, max_seq_len=args.max_seq_len)
-    # Download dataset and use accuracy as metrics
-    # Choose dataset: GLUE/XNLI/ChinesesGLUE/NLPCC-DBQA/LCQMC
+    # Download dataset and get its label list and label num
+    # If you just want labels information, you can omit its tokenizer parameter to avoid preprocessing the train set.
     dataset = hub.dataset.ChnSentiCorp()
-    # For ernie_tiny, it use sub-word to tokenize chinese sentence
-    # If not ernie tiny, sp_model_path and word_dict_path should be set None
-    reader = hub.reader.ClassifyReader(
-        dataset=dataset,
-        vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len,
-        sp_model_path=module.get_spm_path(),
-        word_dict_path=module.get_word_dict_path())
+    num_classes = dataset.num_labels
+    label_list = dataset.get_labels()
     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     # Use "sequence_output" for token-level output.
     token_feature = outputs["sequence_output"]
-    # Setup feed list for data feeder
-    # Must feed all the tensor of module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
     # Setup RunConfig for PaddleHub Fine-tune API
     config = hub.RunConfig(
         use_data_parallel=args.use_data_parallel,
...@@ -85,15 +65,27 @@ if __name__ == '__main__':
     # you must use the outputs["sequence_output"] as the token_feature of TextClassifierTask,
     # rather than outputs["pooled_output"], and feature is None
     cls_task = hub.TextClassifierTask(
-        data_reader=reader,
         token_feature=token_feature,
-        feed_list=feed_list,
         network=args.network,
         num_classes=dataset.num_labels,
         config=config)
     # Data to be prdicted
-    data = [["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"], ["交通方便;环境很好;服务态度很好 房间较小"],
-            ["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"]]
-    print(cls_task.predict(data=data, return_result=True))
+    text_a = [
+        "这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般", "交通方便;环境很好;服务态度很好 房间较小",
+        "19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"
+    ]
+    # Use the appropriate tokenizer to preprocess the data
+    # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
+    if module.name == "ernie_tiny":
+        tokenizer = hub.ErnieTinyTokenizer(
+            vocab_file=module.get_vocab_path(),
+            spm_path=module.get_spm_path(),
+            word_dict_path=module.get_word_dict_path())
+    else:
+        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
+    encoded_data = [
+        tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
+        for text in text_a
+    ]
+    print(cls_task.predict(data=encoded_data, label_list=label_list))
...@@ -7,5 +7,5 @@ python -u predict_predefine_net.py \
     --checkpoint_dir=$CKPT_DIR \
     --max_seq_len=128 \
     --use_gpu=True \
-    --batch_size=24 \
+    --batch_size=1 \
     --network=bilstm
...@@ -40,11 +40,23 @@ class TransformerClassifier(fluid.dygraph.Layer):
 def finetune(args):
-    ernie = hub.Module(name="ernie", max_seq_len=args.max_seq_len)
+    module = hub.Module(name="ernie", max_seq_len=args.max_seq_len)
+    # Use the appropriate tokenizer to preprocess the data set
+    # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
+    if module.name == "ernie_tiny":
+        tokenizer = hub.ErnieTinyTokenizer(
+            vocab_file=module.get_vocab_path(),
+            spm_path=module.get_spm_path(),
+            word_dict_path=module.get_word_dict_path(),
+        )
+    else:
+        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
+    dataset = hub.dataset.ChnSentiCorp(
+        tokenizer=tokenizer, max_seq_len=args.max_seq_len)
     with fluid.dygraph.guard():
-        dataset = hub.dataset.ChnSentiCorp()
         tc = TransformerClassifier(
-            num_classes=dataset.num_labels, transformer=ernie)
+            num_classes=dataset.num_labels, transformer=module)
         adam = AdamOptimizer(learning_rate=1e-5, parameter_list=tc.parameters())
         state_dict_path = os.path.join(args.checkpoint_dir,
                                        'dygraph_state_dict')
...@@ -52,32 +64,31 @@ def finetune(args):
             state_dict, _ = fluid.load_dygraph(state_dict_path)
             tc.load_dict(state_dict)
-        reader = hub.reader.ClassifyReader(
-            dataset=dataset,
-            vocab_path=ernie.get_vocab_path(),
-            max_seq_len=args.max_seq_len,
-            sp_model_path=ernie.get_spm_path(),
-            word_dict_path=ernie.get_word_dict_path())
-        train_reader = reader.data_generator(
-            batch_size=args.batch_size, phase='train')
         loss_sum = acc_sum = cnt = 0
-        # Run num_epoch rounds of training
        for epoch in range(args.num_epoch):
-            # Read the training data and train
-            for batch_id, data in enumerate(train_reader()):
-                input_ids = np.array(data[0][0]).astype(np.int64)
-                position_ids = np.array(data[0][1]).astype(np.int64)
-                segment_ids = np.array(data[0][2]).astype(np.int64)
-                input_mask = np.array(data[0][3]).astype(np.float32)
-                labels = np.array(data[0][4]).astype(np.int64)
+            for batch_id, data in enumerate(
+                    dataset.batch_records_generator(
+                        phase="train",
+                        batch_size=args.batch_size,
+                        shuffle=True,
+                        pad_to_batch_max_seq_len=False)):
+                batch_size = len(data["input_ids"])
+                input_ids = np.array(data["input_ids"]).astype(
+                    np.int64).reshape([batch_size, -1, 1])
+                position_ids = np.array(data["position_ids"]).astype(
+                    np.int64).reshape([batch_size, -1, 1])
+                segment_ids = np.array(data["segment_ids"]).astype(
+                    np.int64).reshape([batch_size, -1, 1])
+                input_mask = np.array(data["input_mask"]).astype(
+                    np.float32).reshape([batch_size, -1, 1])
+                labels = np.array(data["label"]).astype(np.int64).reshape(
+                    [batch_size, 1])
                 pred = tc(input_ids, position_ids, segment_ids, input_mask)
                 acc = fluid.layers.accuracy(pred, to_variable(labels))
                 loss = fluid.layers.cross_entropy(pred, to_variable(labels))
                 avg_loss = fluid.layers.mean(loss)
                 avg_loss.backward()
-                # Update the parameters
                 adam.minimize(avg_loss)
                 loss_sum += avg_loss.numpy() * labels.shape[0]
......
...@@ -16,6 +16,7 @@
 import argparse
 import ast
 import paddlehub as hub
 # yapf: disable
...@@ -39,35 +40,24 @@ if __name__ == '__main__':
     inputs, outputs, program = module.context(
         trainable=True, max_seq_len=args.max_seq_len)
-    # Download dataset and use accuracy as metrics
-    # Choose dataset: GLUE/XNLI/ChinesesGLUE/NLPCC-DBQA/LCQMC
-    # metric should be acc, f1 or matthews
-    dataset = hub.dataset.ChnSentiCorp()
-    metrics_choices = ["acc"]
-    # For ernie_tiny, it use sub-word to tokenize chinese sentence
-    # If not ernie tiny, sp_model_path and word_dict_path should be set None
-    reader = hub.reader.ClassifyReader(
-        dataset=dataset,
-        vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len,
-        sp_model_path=module.get_spm_path(),
-        word_dict_path=module.get_word_dict_path())
+    # Use the appropriate tokenizer to preprocess the data set
+    # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
+    if module.name == "ernie_tiny":
+        tokenizer = hub.ErnieTinyTokenizer(
+            vocab_file=module.get_vocab_path(),
+            spm_path=module.get_spm_path(),
+            word_dict_path=module.get_word_dict_path())
+    else:
+        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
+    dataset = hub.dataset.ChnSentiCorp(
+        tokenizer=tokenizer, max_seq_len=args.max_seq_len)
     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     # Use "sequence_output" for token-level output.
     pooled_output = outputs["pooled_output"]
-    # Setup feed list for data feeder
-    # Must feed all the tensor of module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
     # Select fine-tune strategy, setup config and fine-tune
     strategy = hub.AdamWeightDecayStrategy(
         warmup_proportion=args.warmup_proportion,
...@@ -85,12 +75,11 @@ if __name__ == '__main__':
     # Define a classfication fine-tune task by PaddleHub's API
     cls_task = hub.TextClassifierTask(
-        data_reader=reader,
+        dataset=dataset,
         feature=pooled_output,
-        feed_list=feed_list,
         num_classes=dataset.num_labels,
         config=config,
-        metrics_choices=metrics_choices)
+        metrics_choices=["acc"])
     # Fine-tune and evaluate by PaddleHub's API
     # will finish training, evaluation, testing, save model automatically
......
...@@ -40,35 +40,24 @@ if __name__ == '__main__':
     inputs, outputs, program = module.context(
         trainable=True, max_seq_len=args.max_seq_len)
-    # Download dataset and use accuracy as metrics
-    # Choose dataset: GLUE/XNLI/ChinesesGLUE/NLPCC-DBQA/LCQMC
-    # metric should be acc, f1 or matthews
-    dataset = hub.dataset.ChnSentiCorp()
-    metrics_choices = ["acc"]
-    # For ernie_tiny, it use sub-word to tokenize chinese sentence
-    # If not ernie tiny, sp_model_path and word_dict_path should be set None
-    reader = hub.reader.ClassifyReader(
-        dataset=dataset,
-        vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len,
-        sp_model_path=module.get_spm_path(),
-        word_dict_path=module.get_word_dict_path())
+    # Use the appropriate tokenizer to preprocess the data set
+    # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
+    if module.name == "ernie_tiny":
+        tokenizer = hub.ErnieTinyTokenizer(
+            vocab_file=module.get_vocab_path(),
+            spm_path=module.get_spm_path(),
+            word_dict_path=module.get_word_dict_path())
+    else:
+        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
+    dataset = hub.dataset.ChnSentiCorp(
+        tokenizer=tokenizer, max_seq_len=args.max_seq_len)
     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     # Use "sequence_output" for token-level output.
     token_feature = outputs["sequence_output"]
-    # Setup feed list for data feeder
-    # Must feed all the tensor of module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
     # Select fine-tune strategy, setup config and fine-tune
     strategy = hub.AdamWeightDecayStrategy(
         warmup_proportion=args.warmup_proportion,
...@@ -90,13 +79,12 @@ if __name__ == '__main__':
     # you must use the outputs["sequence_output"] as the token_feature of TextClassifierTask,
     # rather than outputs["pooled_output"], and feature is None
     cls_task = hub.TextClassifierTask(
-        data_reader=reader,
+        dataset=dataset,
         token_feature=token_feature,
-        feed_list=feed_list,
         network=args.network,
         num_classes=dataset.num_labels,
         config=config,
-        metrics_choices=metrics_choices)
+        metrics_choices=["acc"])
     # Fine-tune and evaluate by PaddleHub's API
     # will finish training, evaluation, testing, save model automatically
......
...@@ -31,6 +31,7 @@ from . import dataset
 from . import finetune
 from . import reader
 from . import network
+from . import tokenizer
 from .common.dir import USER_HOME
 from .common.dir import HUB_HOME
...@@ -70,3 +71,6 @@ from .finetune.strategy import CombinedStrategy
 from .autofinetune.evaluator import report_final_result
 from .module.nlp_module import NLPPredictionModule, TransformerModule
+from .tokenizer.bert_tokenizer import BertTokenizer
+from .tokenizer.bert_tokenizer import ErnieTinyTokenizer
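These exports make the new tokenizers available from the package root, which is how the updated demos above construct them. A short usage sketch; illustrative only, with the vocab, sentencepiece and word-dict paths taken from the loaded module:

import paddlehub as hub

module = hub.Module(name="ernie_tiny")
if module.name == "ernie_tiny":
    tokenizer = hub.ErnieTinyTokenizer(
        vocab_file=module.get_vocab_path(),
        spm_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())
else:
    tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())

# The same classes can also be imported directly from the subpackage added by this commit:
# from paddlehub.tokenizer.bert_tokenizer import BertTokenizer, ErnieTinyTokenizer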
...@@ -19,7 +19,10 @@ from __future__ import print_function
 import io
 import csv
+import collections
+from tqdm import tqdm
+import numpy as np
 from paddlehub.dataset import InputExample, BaseDataset
 from paddlehub.common.logger import logger
...@@ -36,7 +39,9 @@ class BaseNLPDataset(BaseDataset):
                  train_file_with_header=False,
                  dev_file_with_header=False,
                  test_file_with_header=False,
-                 predict_file_with_header=False):
+                 predict_file_with_header=False,
+                 tokenizer=None,
+                 max_seq_len=128):
         super(BaseNLPDataset, self).__init__(
             base_path=base_path,
             train_file=train_file,
...@@ -49,6 +54,52 @@ class BaseNLPDataset(BaseDataset):
             dev_file_with_header=dev_file_with_header,
             test_file_with_header=test_file_with_header,
             predict_file_with_header=predict_file_with_header)
self.tokenizer = tokenizer
self.max_seq_len = max_seq_len
self._train_records = None
self._dev_records = None
self._test_records = None
self._predict_records = None
@property
def train_records(self):
if not self._train_records:
examples = self.train_examples
if not self.tokenizer or not examples:
return []
logger.info("Processing the train set...")
self._train_records = self._convert_examples_to_records(examples)
return self._train_records
@property
def dev_records(self):
if not self._dev_records:
examples = self.dev_examples
if not self.tokenizer or not examples:
return []
logger.info("Processing the dev set...")
self._dev_records = self._convert_examples_to_records(examples)
return self._dev_records
@property
def test_records(self):
if not self._test_records:
examples = self.test_examples
if not self.tokenizer or not examples:
return []
logger.info("Processing the test set...")
self._test_records = self._convert_examples_to_records(examples)
return self._test_records
@property
def predict_records(self):
if not self._predict_records:
examples = self.predict_examples
if not self.tokenizer or not examples:
return []
logger.info("Processing the predict set...")
self._predict_records = self._convert_examples_to_records(examples)
return self._predict_records
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -96,3 +147,708 @@ class BaseNLPDataset(BaseDataset): ...@@ -96,3 +147,708 @@ class BaseNLPDataset(BaseDataset):
% (input_file)) % (input_file))
examples.append(example) examples.append(example)
return examples return examples
def _convert_examples_to_records(self, examples):
"""
        Returns a list[dict] including all the input information that the model needs.
        Args:
            examples (list): the data examples returned by _read_file.
        Returns:
            a list with all the example records.
"""
records = []
for example in examples:
record = self.tokenizer.encode(
text=example.text_a,
text_pair=example.text_b,
max_seq_len=self.max_seq_len)
if example.label:
record["label"] = self.label_list.index(
example.label) if self.label_list else float(example.label)
records.append(record)
return records
def get_train_records(self, shuffle=False):
return self.get_records("train", shuffle=shuffle)
def get_dev_records(self, shuffle=False):
return self.get_records("dev", shuffle=shuffle)
def get_test_records(self, shuffle=False):
return self.get_records("test", shuffle=shuffle)
def get_val_records(self, shuffle=False):
return self.get_records("val", shuffle=shuffle)
def get_predict_records(self, shuffle=False):
return self.get_records("predict", shuffle=shuffle)
def get_records(self, phase, shuffle=False):
if phase == "train":
records = self.train_records
elif phase == "dev":
records = self.dev_records
elif phase == "test":
records = self.test_records
elif phase == "val":
records = self.dev_records
elif phase == "predict":
records = self.predict_records
else:
raise ValueError("Invalid phase: %s" % phase)
if shuffle:
np.random.shuffle(records)
return records
def get_feed_list(self, phase):
records = self.get_records(phase)
if records:
feed_list = list(records[0].keys())
else:
if phase == "predict":
feed_list = [
feed_name for feed_name in self.get_feed_list("train")
if feed_name != "label"
]
else:
feed_list = [
feed_name for feed_name in self.get_feed_list("train")
]
return feed_list
def batch_records_generator(self,
phase,
batch_size,
shuffle=True,
pad_to_batch_max_seq_len=False):
""" generate a batch of records, usually used in dynamic graph mode.
Args:
phase (str): the dataset phase, can be "train", "dev", "val", "test" or "predict".
batch_size (int): the data batch size
shuffle (bool): if set to True, will shuffle the dataset.
pad_to_batch_max_seq_len (bool): if set to True, will dynamically pad to the max sequence length of the batch data.
Only recommended to set to True when the model has used RNN.
"""
records = self.get_records(phase, shuffle=shuffle)
batch_records = []
batch_lens = []
for record in records:
batch_records.append(record)
if pad_to_batch_max_seq_len:
# This may reduce the processing speed
tokens_wo_pad = [
token for token in self.tokenizer.decode(
record, only_convert_to_tokens=True)
if token != self.tokenizer.pad_token
]
batch_lens.append(len(tokens_wo_pad))
if len(batch_records) == batch_size:
if pad_to_batch_max_seq_len:
# This may reduce the processing speed.
batch_max_seq_len = max(batch_lens)
for record in batch_records:
for key, value in record.items():
if isinstance(value, list):
# This may not be universal
record[key] = value[:batch_max_seq_len]
rev_batch_records = {
key: [record[key] for record in batch_records]
for key in batch_records[0]
}
yield rev_batch_records
batch_records = []
batch_lens = []
if batch_records:
if pad_to_batch_max_seq_len:
# This may reduce the processing speed.
batch_max_seq_len = max(batch_lens)
for record in batch_records:
for key in record.keys():
if isinstance(record[key], list):
record[key] = record[key][:batch_max_seq_len]
rev_batch_records = {
key: [record[key] for record in batch_records]
for key in batch_records[0]
}
yield rev_batch_records
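    # Usage sketch (illustrative note, not part of this file): once a tokenizer is
    # attached, records can be consumed all at once or in batches, e.g.
    #
    #   dataset = hub.dataset.ChnSentiCorp(tokenizer=tokenizer, max_seq_len=128)
    #   train_records = dataset.get_train_records(shuffle=True)
    #   for batch in dataset.batch_records_generator(
    #           phase="train", batch_size=32, shuffle=True,
    #           pad_to_batch_max_seq_len=False):
    #       input_ids = batch["input_ids"]  # batch is a dict: key -> list of per-record values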
class TextClassificationDataset(BaseNLPDataset):
def _convert_examples_to_records(self, examples):
"""
        Returns a list[dict] including all the input information that the model needs.
        Args:
            examples (list): the data examples returned by _read_file.
        Returns:
            a list with all the example records.
"""
records = []
for example in examples:
record = self.tokenizer.encode(
text=example.text_a,
text_pair=example.text_b,
max_seq_len=self.max_seq_len)
if example.label:
record["label"] = self.label_list.index(example.label)
records.append(record)
return records
class RegressionDataset(BaseNLPDataset):
def _convert_examples_to_records(self, examples):
"""
        Returns a list[dict] including all the input information that the model needs.
        Args:
            examples (list): the data examples returned by _read_file.
        Returns:
            a list with all the example records.
"""
records = []
for example in examples:
record = self.tokenizer.encode(
text=example.text_a,
text_pair=example.text_b,
max_seq_len=self.max_seq_len)
if example.label:
record["label"] = float(example.label)
records.append(record)
return records
class SeqLabelingDataset(BaseNLPDataset):
def __init__(self,
base_path,
train_file=None,
dev_file=None,
test_file=None,
predict_file=None,
label_file=None,
label_list=None,
train_file_with_header=False,
dev_file_with_header=False,
test_file_with_header=False,
predict_file_with_header=False,
tokenizer=None,
max_seq_len=128,
split_char="\002",
no_entity_label="O"):
self.no_entity_label = no_entity_label
self.split_char = split_char
super(SeqLabelingDataset, self).__init__(
base_path=base_path,
train_file=train_file,
dev_file=dev_file,
test_file=test_file,
predict_file=predict_file,
label_file=label_file,
label_list=label_list,
train_file_with_header=train_file_with_header,
dev_file_with_header=dev_file_with_header,
test_file_with_header=test_file_with_header,
predict_file_with_header=predict_file_with_header,
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _convert_examples_to_records(self, examples):
"""
Returns a list[dict] including all the input information what the model need.
Args:
examples (list): the data examples, returned by _read_file.
Returns:
a list with all the examples record.
"""
records = []
for example in examples:
tokens, labels = self._reseg_token_label(
tokens=example.text_a.split(self.split_char),
labels=example.label.split(self.split_char))
record = self.tokenizer.encode(
text=tokens, max_seq_len=self.max_seq_len)
if labels:
record["label"] = []
tokens_with_special_tokens = self.tokenizer.decode(
record, only_convert_to_tokens=True)
tokens_index = 0
for token in tokens_with_special_tokens:
if tokens_index < len(
tokens) and token == tokens[tokens_index]:
record["label"].append(
self.label_list.index(labels[tokens_index]))
tokens_index += 1
else:
record["label"].append(
self.label_list.index(self.no_entity_label))
records.append(record)
return records
def _reseg_token_label(self, tokens, labels=None):
if labels:
if len(tokens) != len(labels):
raise ValueError(
"The length of tokens must be same with labels")
ret_tokens = []
ret_labels = []
for token, label in zip(tokens, labels):
sub_token = self.tokenizer.tokenize(token)
if len(sub_token) == 0:
continue
ret_tokens.extend(sub_token)
ret_labels.append(label)
if len(sub_token) < 2:
continue
sub_label = label
if label.startswith("B-"):
sub_label = "I-" + label[2:]
ret_labels.extend([sub_label] * (len(sub_token) - 1))
if len(ret_tokens) != len(ret_labels):
raise ValueError(
"The length of ret_tokens can't match with labels")
return ret_tokens, ret_labels
else:
ret_tokens = []
for token in tokens:
sub_token = self.tokenizer.tokenize(token)
if len(sub_token) == 0:
continue
ret_tokens.extend(sub_token)
if len(sub_token) < 2:
continue
return ret_tokens, None
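# Illustrative example of the re-segmentation above (assuming a WordPiece-style
# tokenizer): the word-level input
#     tokens = ["Obama", "visited", "Paris"], labels = ["B-PER", "O", "B-LOC"]
# might be tokenized into ["Ob", "##ama", "visited", "Paris"]. Every extra
# sub-token inherits its word's label, with a leading "B-" rewritten to "I-",
# yielding ret_labels = ["B-PER", "I-PER", "O", "B-LOC"].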
class MultiLabelDataset(BaseNLPDataset):
def _convert_examples_to_records(self, examples):
"""
Returns a list[dict] including all the input information what the model need.
Args:
examples (list): the data examples, returned by _read_file.
max_seq_len (int): padding to the max sequence length.
Returns:
a list with all the examples record.
"""
records = []
for example in examples:
record = self.tokenizer.encode(
text=example.text_a,
text_pair=example.text_b,
max_seq_len=self.max_seq_len)
if example.label:
record["label"] = [int(label) for label in example.label]
records.append(record)
return records
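# Illustrative note: for a multi-label example whose label field is a sequence
# of "0"/"1" flags (e.g. the six toxicity classes of the Toxic dataset), the
# record built above carries a multi-hot list such as
#     record["label"] = [0, 1, 0, 0, 1, 0]
# with one entry per class in label_list.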
class MRCDataset(BaseNLPDataset):
def __init__(
self,
base_path,
train_file=None,
dev_file=None,
test_file=None,
predict_file=None,
label_file=None,
label_list=None,
train_file_with_header=False,
dev_file_with_header=False,
test_file_with_header=False,
predict_file_with_header=False,
tokenizer=None,
max_seq_len=128,
max_query_len=64,
doc_stride=128,
):
# Note: this deliberately calls BaseDataset.__init__ (the grandparent) rather
# than BaseNLPDataset.__init__; the tokenizer-related attributes are set up
# below, and MRC records are built together with their features.
super(BaseNLPDataset, self).__init__(
base_path=base_path,
train_file=train_file,
dev_file=dev_file,
test_file=test_file,
predict_file=predict_file,
label_file=label_file,
label_list=label_list,
train_file_with_header=train_file_with_header,
dev_file_with_header=dev_file_with_header,
test_file_with_header=test_file_with_header,
predict_file_with_header=predict_file_with_header,
)
self.tokenizer = tokenizer
self.max_seq_len = max_seq_len
self.max_query_len = max_query_len
self._DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
self.doc_stride = doc_stride
self._Feature = collections.namedtuple("Feature", [
"unique_id",
"example_index",
"doc_span_index",
"tokens",
"token_to_orig_map",
"token_is_max_context",
])
self.special_tokens_num, self.special_tokens_num_before_doc = self._get_special_tokens_num(
)
self._train_records = None
self._dev_records = None
self._test_records = None
self._predict_records = None
self._train_features = None
self._dev_features = None
self._test_features = None
self._predict_features = None
@property
def train_records(self):
if not self._train_records:
examples = self.train_examples
if not self.tokenizer or not examples:
return []
logger.info("Processing the train set...")
self._train_records, self._train_features = self._convert_examples_to_records_and_features(
examples, "train")
return self._train_records
@property
def dev_records(self):
if not self._dev_records:
examples = self.dev_examples
if not self.tokenizer or not examples:
return []
logger.info("Processing the dev set...")
self._dev_records, self._dev_features = self._convert_examples_to_records_and_features(
examples, "dev")
return self._dev_records
@property
def test_records(self):
if not self._test_records:
examples = self.test_examples
if not self.tokenizer or not examples:
return []
logger.info("Processing the test set...")
self._test_records, self._test_features = self._convert_examples_to_records_and_features(
examples, "test")
return self._test_records
@property
def predict_records(self):
if not self._predict_records:
examples = self.predict_examples
if not self.tokenizer or not examples:
return []
logger.info("Processing the predict set...")
self._predict_records, self._predict_features = self._convert_examples_to_records_and_features(
examples, "predict")
return self._predict_records
@property
def train_features(self):
if not self._train_features:
examples = self.train_examples
if not self.tokenizer or not examples:
return []
logger.info("Processing the train set...")
self._train_records, self._train_features = self._convert_examples_to_records_and_features(
examples, "train")
return self._train_features
@property
def dev_features(self):
if not self._dev_features:
examples = self.dev_examples
if not self.tokenizer or not examples:
return []
logger.info("Processing the dev set...")
self._dev_records, self._dev_features = self._convert_examples_to_records_and_features(
examples, "dev")
return self._dev_features
@property
def test_features(self):
if not self._test_features:
examples = self.test_examples
if not self.tokenizer or not examples:
return []
logger.info("Processing the test set...")
self._test_records, self._test_features = self._convert_examples_to_records_and_features(
examples, "test")
return self._test_features
@property
def predict_features(self):
if not self._predict_features:
examples = self.predict_examples
if not self.tokenizer or not examples:
return []
logger.info("Processing the predict set...")
self._predict_records, self._predict_features = self._convert_examples_to_records_and_features(
examples, "predict")
return self._predict_features
def _get_special_tokens_num(self):
if not self.tokenizer:
return None, None
# We must have a pad token, so we can use it to make fake text.
fake_question = [self.tokenizer.pad_token]
fake_answer = [self.tokenizer.pad_token]
special_tokens_num = 0
special_tokens_num_before_doc = 0
seen_pad_num = 0
fake_record = self.tokenizer.encode(fake_question, fake_answer)
fake_tokens_with_special_tokens = self.tokenizer.decode(
fake_record, only_convert_to_tokens=True)
for token in fake_tokens_with_special_tokens:
if token == self.tokenizer.pad_token:
seen_pad_num += 1
if seen_pad_num > 2:
# The third pad_token is added by padding
break
else:
special_tokens_num += 1
if seen_pad_num < 2:
# The second pad_token is the fake_answer
special_tokens_num_before_doc += 1
return special_tokens_num, special_tokens_num_before_doc
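# Worked example of the counting above (assuming a BERT-style tokenizer that
# encodes a pair as [CLS] A [SEP] B [SEP] and pads with [PAD]): the fake record
# decodes to [CLS] [PAD] [SEP] [PAD] [SEP] [PAD] ..., the loop stops at the
# third [PAD] (which comes from padding), and the method returns
#     special_tokens_num = 3            ([CLS] and the two [SEP])
#     special_tokens_num_before_doc = 2 ([CLS] and the first [SEP])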
def _convert_examples_to_records_and_features(self, examples, phase):
"""Loads a data file into a list of `InputBatch`s."""
features = []
records = []
unique_id = 1000000000
with tqdm(total=len(examples)) as process_bar:
for (example_index, example) in enumerate(examples):
# Tokenize question_text
query_tokens = self.tokenizer.tokenize(example.question_text)
if len(query_tokens) > self.max_query_len:
query_tokens = query_tokens[0:self.max_query_len]
# Tokenize doc_tokens and get token-sub_token position map
tok_to_orig_index = []
orig_to_tok_index = []
all_doc_tokens = []
for (i, token) in enumerate(example.doc_tokens):
orig_to_tok_index.append(len(all_doc_tokens))
sub_tokens = self.tokenizer.tokenize(token)
for sub_token in sub_tokens:
tok_to_orig_index.append(i)
all_doc_tokens.append(sub_token)
# Update the answer position to the new sub_token position
tok_start_position = None
tok_end_position = None
is_impossible = example.is_impossible if hasattr(
example, "is_impossible") else False
if phase != "predict" and is_impossible:
tok_start_position = -1
tok_end_position = -1
if phase != "predict" and not is_impossible:
tok_start_position = orig_to_tok_index[
example.start_position]
if example.end_position < len(example.doc_tokens) - 1:
tok_end_position = orig_to_tok_index[
example.end_position + 1] - 1
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position,
tok_end_position) = self.improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position,
self.tokenizer, example.orig_answer_text)
# We can have documents that are longer than the maximum sequence length.
# To deal with this we do a sliding window approach, where we take chunks
# of the up to our max length with a stride of `doc_stride`.
max_tokens_for_doc = self.max_seq_len - len(
query_tokens) - self.special_tokens_num
doc_spans = []
start_offset = 0
while start_offset < len(all_doc_tokens):
length = len(all_doc_tokens) - start_offset
if length > max_tokens_for_doc:
length = max_tokens_for_doc
doc_spans.append(
self._DocSpan(start=start_offset, length=length))
if start_offset + length == len(all_doc_tokens):
break
start_offset += min(length, self.doc_stride)
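# Illustrative example of the sliding window above: with 300 document
# sub-tokens, max_tokens_for_doc = 160 and doc_stride = 128, doc_spans becomes
#     [(start=0, length=160), (start=128, length=160), (start=256, length=44)]
# so every token is covered and neighbouring spans overlap.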
for (doc_span_index, doc_span) in enumerate(doc_spans):
# Update the start_position and end_position to doc_span
start_position = None
end_position = None
if phase != "predict":
if is_impossible:
start_position = 0
end_position = 0
else:
# For training, if our document chunk does not contain an annotation
# we throw it out, since there is nothing to predict.
doc_start = doc_span.start
doc_end = doc_span.start + doc_span.length - 1
out_of_span = False
if not (tok_start_position >= doc_start
and tok_end_position <= doc_end):
out_of_span = True
if out_of_span:
start_position = 0
end_position = 0
else:
doc_offset = len(
query_tokens
) + self.special_tokens_num_before_doc
start_position = tok_start_position - doc_start + doc_offset
end_position = tok_end_position - doc_start + doc_offset
record = self.tokenizer.encode(
text=query_tokens,
text_pair=all_doc_tokens[doc_span.start:doc_span.start +
doc_span.length],
max_seq_len=self.max_seq_len)
record["start_position"] = start_position
record["end_position"] = end_position
record["unique_id"] = unique_id
records.append(record)
# The remaining information is stored in `feature`, which is useful for post-processing.
# A record and its feature are linked by unique_id.
tokens = self.tokenizer.decode(
record, only_convert_to_tokens=True)
token_to_orig_map = {}
token_is_max_context = {}
doc_token_start = len(
query_tokens) + self.special_tokens_num_before_doc
for i in range(doc_span.length):
# split_token_index: the token's position in the tokenized document
# doc_token_index: the token's position in the encoded record
split_token_index = doc_span.start + i
doc_token_index = doc_token_start + i
token_to_orig_map[doc_token_index] = tok_to_orig_index[
split_token_index]
is_max_context = self.check_is_max_context(
doc_spans, doc_span_index, split_token_index)
token_is_max_context[doc_token_index] = is_max_context
feature = self._Feature(
unique_id=unique_id,
example_index=example_index,
doc_span_index=doc_span_index,
tokens=tokens,
token_to_orig_map=token_to_orig_map,
token_is_max_context=token_is_max_context,
)
features.append(feature)
unique_id += 1
process_bar.update(1)
return records, features
def improve_answer_span(self, doc_tokens, input_start, input_end, tokenizer,
orig_answer_text):
"""Returns tokenized answer spans that better match the annotated answer."""
# The SQuAD annotations are character based. We first project them to
# whitespace-tokenized words. But then after WordPiece tokenization, we can
# often find a "better match". For example:
#
# Question: What year was John Smith born?
# Context: The leader was John Smith (1895-1943).
# Answer: 1895
#
# The original whitespace-tokenized answer will be "(1895-1943).". However
# after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
# the exact answer, 1895.
#
# However, this is not always possible. Consider the following:
#
# Question: What country is the top exporter of electronics?                  # Context: The Japanese electronics industry is the largest in the world.
# Answer: Japan
#
# In this case, the annotator chose "Japan" as a character sub-span of
# the word "Japanese". Since our WordPiece tokenizer does not split
# "Japanese", we just use "Japanese" as the annotation. This is fairly rare
# in SQuAD, but does happen.
tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
for new_start in range(input_start, input_end + 1):
for new_end in range(input_end, new_start - 1, -1):
text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
if text_span == tok_answer_text:
return (new_start, new_end)
return (input_start, input_end)
def check_is_max_context(self, doc_spans, cur_span_index, position):
"""Check if this is the 'max context' doc span for the token."""
# Because of the sliding window approach taken to scoring documents, a single
# token can appear in multiple documents. E.g.
# Doc: the man went to the store and bought a gallon of milk
# Span A: the man went to the
# Span B: to the store and bought
# Span C: and bought a gallon of
# ...
#
# Now the word 'bought' will have two scores from spans B and C. We only
# want to consider the score with "maximum context", which we define as
# the *minimum* of its left and right context (the *sum* of left and
# right context will always be the same, of course).
#
# In the example the maximum context for 'bought' would be span C since
# it has 1 left context and 3 right context, while span B has 4 left context
# and 0 right context.
best_score = None
best_span_index = None
for (span_index, doc_span) in enumerate(doc_spans):
end = doc_span.start + doc_span.length - 1
if position < doc_span.start:
continue
if position > end:
continue
num_left_context = position - doc_span.start
num_right_context = end - position
score = min(num_left_context,
num_right_context) + 0.01 * doc_span.length
if best_score is None or score > best_score:
best_score = score
best_span_index = span_index
return cur_span_index == best_span_index
def get_features(self, phase):
if phase == "train":
return self.train_features
elif phase == "dev":
return self.dev_features
elif phase == "test":
return self.test_features
elif phase == "val":
return self.dev_features
elif phase == "predict":
return self.predict_features
else:
raise ValueError("Invalid phase: %s" % phase)
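# Note on how records and features fit together (summary of the code above):
# the records returned by train_records/dev_records/... feed the model
# (token ids plus start_position, end_position and unique_id), while the
# matching features, retrieved via get_features(phase), keep tokens,
# token_to_orig_map and token_is_max_context so that predicted positions can
# be mapped back to the original document text during post-processing.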
...@@ -20,11 +20,16 @@ from __future__ import print_function ...@@ -20,11 +20,16 @@ from __future__ import print_function
import os import os
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
class BQ(BaseNLPDataset): class BQ(TextClassificationDataset):
def __init__(self): """
The Bank Question (BQ) corpus, a Chinese corpus for sentence semantic equivalence identification (SSEI),
contains 120,000 question pairs from 1-year online bank custom service logs.
"""
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "bq") dataset_dir = os.path.join(DATA_HOME, "bq")
base_path = self._download_dataset( base_path = self._download_dataset(
dataset_dir, dataset_dir,
...@@ -36,18 +41,16 @@ class BQ(BaseNLPDataset): ...@@ -36,18 +41,16 @@ class BQ(BaseNLPDataset):
test_file="test.txt", test_file="test.txt",
label_file=None, label_file=None,
label_list=["0", "1"], label_list=["0", "1"],
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
if __name__ == "__main__": if __name__ == "__main__":
ds = BQ() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
print("first 10 dev") ds = BQ(tokenizer=BertTokenizer(vocab_file='vocab.txt'), max_seq_len=10)
print("first 10 dev examples")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 train") print("first 10 dev records")
for e in ds.get_train_examples()[:10]: for e in ds.get_dev_records()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print(e)
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
...@@ -23,16 +23,16 @@ import csv ...@@ -23,16 +23,16 @@ import csv
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
class ChnSentiCorp(BaseNLPDataset): class ChnSentiCorp(TextClassificationDataset):
""" """
ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for
opinion mining) opinion mining)
""" """
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "chnsenticorp") dataset_dir = os.path.join(DATA_HOME, "chnsenticorp")
base_path = self._download_dataset( base_path = self._download_dataset(
dataset_dir, dataset_dir,
...@@ -44,7 +44,8 @@ class ChnSentiCorp(BaseNLPDataset): ...@@ -44,7 +44,8 @@ class ChnSentiCorp(BaseNLPDataset):
test_file="test.tsv", test_file="test.tsv",
label_file=None, label_file=None,
label_list=["0", "1"], label_list=["0", "1"],
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -63,6 +64,13 @@ class ChnSentiCorp(BaseNLPDataset): ...@@ -63,6 +64,13 @@ class ChnSentiCorp(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = ChnSentiCorp() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
for e in ds.get_train_examples()[:10]: tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = ChnSentiCorp(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev examples")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -20,7 +20,7 @@ import os ...@@ -20,7 +20,7 @@ import os
from paddlehub.reader import tokenization from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import MRCDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/cmrc2018.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/cmrc2018.tar.gz"
SPIECE_UNDERLINE = '▁' SPIECE_UNDERLINE = '▁'
...@@ -62,10 +62,14 @@ class CMRC2018Example(object): ...@@ -62,10 +62,14 @@ class CMRC2018Example(object):
return s return s
class CMRC2018(BaseNLPDataset): class CMRC2018(MRCDataset):
"""A single set of features of data.""" """A single set of features of data."""
def __init__(self): def __init__(self,
tokenizer=None,
max_seq_len=None,
max_query_len=64,
doc_stride=128):
dataset_dir = os.path.join(DATA_HOME, "cmrc2018") dataset_dir = os.path.join(DATA_HOME, "cmrc2018")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(CMRC2018, self).__init__( super(CMRC2018, self).__init__(
...@@ -75,6 +79,10 @@ class CMRC2018(BaseNLPDataset): ...@@ -75,6 +79,10 @@ class CMRC2018(BaseNLPDataset):
test_file=None, test_file=None,
label_file=None, label_file=None,
label_list=None, label_list=None,
tokenizer=tokenizer,
max_seq_len=max_seq_len,
max_query_len=max_query_len,
doc_stride=doc_stride,
) )
def _read_file(self, input_file, phase=False): def _read_file(self, input_file, phase=False):
...@@ -201,7 +209,9 @@ class CMRC2018(BaseNLPDataset): ...@@ -201,7 +209,9 @@ class CMRC2018(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
print("begin") print("begin")
ds = CMRC2018() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = CMRC2018(tokenizer=tokenizer, max_seq_len=50)
print("train") print("train")
examples = ds.get_train_examples() examples = ds.get_train_examples()
for index, e in enumerate(examples): for index, e in enumerate(examples):
......
...@@ -121,6 +121,20 @@ class BaseDataset(object): ...@@ -121,6 +121,20 @@ class BaseDataset(object):
def get_predict_examples(self): def get_predict_examples(self):
return self.predict_examples return self.predict_examples
def get_examples(self, phase):
if phase == "train":
return self.get_train_examples()
elif phase == "dev":
return self.get_dev_examples()
elif phase == "test":
return self.get_test_examples()
elif phase == "val":
return self.get_val_examples()
elif phase == "predict":
return self.get_predict_examples()
else:
raise ValueError("Invalid phase: %s" % phase)
def get_labels(self): def get_labels(self):
return self.label_list return self.label_list
......
...@@ -20,7 +20,7 @@ import os ...@@ -20,7 +20,7 @@ import os
from paddlehub.reader import tokenization from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import MRCDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/drcd.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/drcd.tar.gz"
SPIECE_UNDERLINE = '▁' SPIECE_UNDERLINE = '▁'
...@@ -62,10 +62,16 @@ class DRCDExample(object): ...@@ -62,10 +62,16 @@ class DRCDExample(object):
return s return s
class DRCD(BaseNLPDataset): class DRCD(MRCDataset):
"""A single set of features of data.""" """A single set of features of data."""
def __init__(self): def __init__(
self,
tokenizer=None,
max_seq_len=None,
max_query_len=64,
doc_stride=128,
):
dataset_dir = os.path.join(DATA_HOME, "drcd") dataset_dir = os.path.join(DATA_HOME, "drcd")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(DRCD, self).__init__( super(DRCD, self).__init__(
...@@ -75,6 +81,10 @@ class DRCD(BaseNLPDataset): ...@@ -75,6 +81,10 @@ class DRCD(BaseNLPDataset):
test_file="DRCD_test.json", test_file="DRCD_test.json",
label_file=None, label_file=None,
label_list=None, label_list=None,
tokenizer=tokenizer,
max_seq_len=max_seq_len,
max_query_len=max_query_len,
doc_stride=doc_stride,
) )
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
...@@ -176,8 +186,8 @@ class DRCD(BaseNLPDataset): ...@@ -176,8 +186,8 @@ class DRCD(BaseNLPDataset):
cleaned_answer_text = "".join( cleaned_answer_text = "".join(
tokenization.whitespace_tokenize(orig_answer_text)) tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1: if actual_text.find(cleaned_answer_text) == -1:
logger.warning((actual_text, " vs ", logger.warning("Could not find answer: '%s' vs. '%s'" %
cleaned_answer_text, " in ", qa)) (actual_text, cleaned_answer_text))
continue continue
example = DRCDExample( example = DRCDExample(
qas_id=qas_id, qas_id=qas_id,
...@@ -191,7 +201,9 @@ class DRCD(BaseNLPDataset): ...@@ -191,7 +201,9 @@ class DRCD(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = DRCD() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = DRCD(tokenizer=tokenizer, max_seq_len=50)
print("train") print("train")
examples = ds.get_train_examples() examples = ds.get_train_examples()
for index, e in enumerate(examples): for index, e in enumerate(examples):
......
...@@ -36,7 +36,7 @@ class GLUE(BaseNLPDataset): ...@@ -36,7 +36,7 @@ class GLUE(BaseNLPDataset):
for more information for more information
""" """
def __init__(self, sub_dataset='SST-2'): def __init__(self, sub_dataset='SST-2', tokenizer=None, max_seq_len=None):
# sub_dataset : CoLA, MNLI, MRPC, QNLI, QQP, RTE, SST-2, STS-B # sub_dataset : CoLA, MNLI, MRPC, QNLI, QQP, RTE, SST-2, STS-B
if sub_dataset not in [ if sub_dataset not in [
'CoLA', 'MNLI', 'MNLI_m', 'MNLI_mm', 'MRPC', 'QNLI', 'QQP', 'CoLA', 'MNLI', 'MNLI_m', 'MNLI_mm', 'MRPC', 'QNLI', 'QQP',
...@@ -85,7 +85,8 @@ class GLUE(BaseNLPDataset): ...@@ -85,7 +85,8 @@ class GLUE(BaseNLPDataset):
predict_file=predict_file, predict_file=predict_file,
label_file=None, label_file=None,
label_list=label_list, label_list=label_list,
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -165,11 +166,13 @@ class GLUE(BaseNLPDataset): ...@@ -165,11 +166,13 @@ class GLUE(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
for sub_dataset in [ for sub_dataset in [
'CoLA', 'MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'SST-2', 'STS-B' 'CoLA', 'MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'SST-2', 'STS-B'
]: ]:
print(sub_dataset) print(sub_dataset)
ds = GLUE(sub_dataset=sub_dataset) ds = GLUE(sub_dataset=sub_dataset, tokenizer=tokenizer, max_seq_len=10)
for e in ds.get_train_examples()[:2]: for e in ds.get_train_examples()[:2]:
print(e) print(e)
print() print()
...@@ -182,3 +185,6 @@ if __name__ == "__main__": ...@@ -182,3 +185,6 @@ if __name__ == "__main__":
for e in ds.get_predict_examples()[:2]: for e in ds.get_predict_examples()[:2]:
print(e) print(e)
print() print()
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -22,13 +22,13 @@ import os ...@@ -22,13 +22,13 @@ import os
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/iflytek.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/iflytek.tar.gz"
class IFLYTEK(BaseNLPDataset): class IFLYTEK(TextClassificationDataset):
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "iflytek") dataset_dir = os.path.join(DATA_HOME, "iflytek")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(IFLYTEK, self).__init__( super(IFLYTEK, self).__init__(
...@@ -38,7 +38,8 @@ class IFLYTEK(BaseNLPDataset): ...@@ -38,7 +38,8 @@ class IFLYTEK(BaseNLPDataset):
test_file="test.txt", test_file="test.txt",
label_file=None, label_file=None,
label_list=[str(i) for i in range(119)], label_list=[str(i) for i in range(119)],
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -56,7 +57,9 @@ class IFLYTEK(BaseNLPDataset): ...@@ -56,7 +57,9 @@ class IFLYTEK(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = IFLYTEK() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = IFLYTEK(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
...@@ -67,3 +70,6 @@ if __name__ == "__main__": ...@@ -67,3 +70,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]: for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds) print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -23,17 +23,17 @@ import csv ...@@ -23,17 +23,17 @@ import csv
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/inews.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/inews.tar.gz"
class INews(BaseNLPDataset): class INews(TextClassificationDataset):
""" """
INews is a sentiment analysis dataset for Internet News INews is a sentiment analysis dataset for Internet News
""" """
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "inews") dataset_dir = os.path.join(DATA_HOME, "inews")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(INews, self).__init__( super(INews, self).__init__(
...@@ -43,7 +43,8 @@ class INews(BaseNLPDataset): ...@@ -43,7 +43,8 @@ class INews(BaseNLPDataset):
test_file="test.txt", test_file="test.txt",
label_file=None, label_file=None,
label_list=["0", "1", "2"], label_list=["0", "1", "2"],
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -60,7 +61,10 @@ class INews(BaseNLPDataset): ...@@ -60,7 +61,10 @@ class INews(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = INews() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = INews(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
...@@ -71,3 +75,6 @@ if __name__ == "__main__": ...@@ -71,3 +75,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]: for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds) print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -23,13 +23,13 @@ import csv ...@@ -23,13 +23,13 @@ import csv
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/lcqmc.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/lcqmc.tar.gz"
class LCQMC(BaseNLPDataset): class LCQMC(TextClassificationDataset):
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "lcqmc") dataset_dir = os.path.join(DATA_HOME, "lcqmc")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(LCQMC, self).__init__( super(LCQMC, self).__init__(
...@@ -39,7 +39,8 @@ class LCQMC(BaseNLPDataset): ...@@ -39,7 +39,8 @@ class LCQMC(BaseNLPDataset):
test_file="test.tsv", test_file="test.tsv",
label_file=None, label_file=None,
label_list=["0", "1"], label_list=["0", "1"],
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -58,7 +59,10 @@ class LCQMC(BaseNLPDataset): ...@@ -58,7 +59,10 @@ class LCQMC(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = LCQMC() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = LCQMC(tokenizer=tokenizer, max_seq_len=512)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
...@@ -69,3 +73,7 @@ if __name__ == "__main__": ...@@ -69,3 +73,7 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]: for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds) print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -23,12 +23,12 @@ import csv ...@@ -23,12 +23,12 @@ import csv
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import SeqLabelingDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/msra_ner.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/msra_ner.tar.gz"
class MSRA_NER(BaseNLPDataset): class MSRA_NER(SeqLabelingDataset):
""" """
A set of manually annotated Chinese word-segmentation data and A set of manually annotated Chinese word-segmentation data and
specifications for training and testing a Chinese word-segmentation system specifications for training and testing a Chinese word-segmentation system
...@@ -36,7 +36,7 @@ class MSRA_NER(BaseNLPDataset): ...@@ -36,7 +36,7 @@ class MSRA_NER(BaseNLPDataset):
https://www.microsoft.com/en-us/download/details.aspx?id=52531 https://www.microsoft.com/en-us/download/details.aspx?id=52531
""" """
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "msra_ner") dataset_dir = os.path.join(DATA_HOME, "msra_ner")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(MSRA_NER, self).__init__( super(MSRA_NER, self).__init__(
...@@ -48,7 +48,8 @@ class MSRA_NER(BaseNLPDataset): ...@@ -48,7 +48,8 @@ class MSRA_NER(BaseNLPDataset):
label_list=[ label_list=[
"B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O" "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"
], ],
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -67,7 +68,9 @@ class MSRA_NER(BaseNLPDataset): ...@@ -67,7 +68,9 @@ class MSRA_NER(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = MSRA_NER() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = MSRA_NER(tokenizer=tokenizer, max_seq_len=30)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
...@@ -78,3 +81,6 @@ if __name__ == "__main__": ...@@ -78,3 +81,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]: for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds) print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -23,19 +23,19 @@ import csv ...@@ -23,19 +23,19 @@ import csv
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/nlpcc-dbqa.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/nlpcc-dbqa.tar.gz"
class NLPCC_DBQA(BaseNLPDataset): class NLPCC_DBQA(TextClassificationDataset):
""" """
Please refer to Please refer to
http://tcci.ccf.org.cn/conference/2017/dldoc/taskgline05.pdf http://tcci.ccf.org.cn/conference/2017/dldoc/taskgline05.pdf
for more information for more information
""" """
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "nlpcc-dbqa") dataset_dir = os.path.join(DATA_HOME, "nlpcc-dbqa")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(NLPCC_DBQA, self).__init__( super(NLPCC_DBQA, self).__init__(
...@@ -45,7 +45,8 @@ class NLPCC_DBQA(BaseNLPDataset): ...@@ -45,7 +45,8 @@ class NLPCC_DBQA(BaseNLPDataset):
test_file="test.tsv", test_file="test.tsv",
label_file=None, label_file=None,
label_list=["0", "1"], label_list=["0", "1"],
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -64,7 +65,9 @@ class NLPCC_DBQA(BaseNLPDataset): ...@@ -64,7 +65,9 @@ class NLPCC_DBQA(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = NLPCC_DBQA() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = NLPCC_DBQA(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
...@@ -75,3 +78,6 @@ if __name__ == "__main__": ...@@ -75,3 +78,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]: for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds) print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -20,7 +20,7 @@ import os ...@@ -20,7 +20,7 @@ import os
from paddlehub.reader import tokenization from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import MRCDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz"
...@@ -65,10 +65,17 @@ class SquadExample(object): ...@@ -65,10 +65,17 @@ class SquadExample(object):
return s return s
class SQUAD(BaseNLPDataset): class SQUAD(MRCDataset):
"""A single set of features of data.""" """A single set of features of data."""
def __init__(self, version_2_with_negative=False): def __init__(
self,
version_2_with_negative=False,
tokenizer=None,
max_seq_len=None,
max_query_len=64,
doc_stride=128,
):
self.version_2_with_negative = version_2_with_negative self.version_2_with_negative = version_2_with_negative
if not version_2_with_negative: if not version_2_with_negative:
train_file = "train-v1.1.json" train_file = "train-v1.1.json"
...@@ -87,6 +94,10 @@ class SQUAD(BaseNLPDataset): ...@@ -87,6 +94,10 @@ class SQUAD(BaseNLPDataset):
test_file=None, test_file=None,
label_file=None, label_file=None,
label_list=None, label_list=None,
tokenizer=tokenizer,
max_seq_len=max_seq_len,
max_query_len=max_query_len,
doc_stride=doc_stride,
) )
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
...@@ -177,7 +188,10 @@ class SQUAD(BaseNLPDataset): ...@@ -177,7 +188,10 @@ class SQUAD(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = SQUAD(version_2_with_negative=True) from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = SQUAD(
version_2_with_negative=True, tokenizer=tokenizer, max_seq_len=512)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:2]: for e in ds.get_dev_examples()[:2]:
print(e) print(e)
......
...@@ -22,13 +22,13 @@ import os ...@@ -22,13 +22,13 @@ import os
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/thucnews.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/thucnews.tar.gz"
class THUCNEWS(BaseNLPDataset): class THUCNEWS(TextClassificationDataset):
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "thucnews") dataset_dir = os.path.join(DATA_HOME, "thucnews")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(THUCNEWS, self).__init__( super(THUCNEWS, self).__init__(
...@@ -38,7 +38,8 @@ class THUCNEWS(BaseNLPDataset): ...@@ -38,7 +38,8 @@ class THUCNEWS(BaseNLPDataset):
test_file="test.txt", test_file="test.txt",
label_file=None, label_file=None,
label_list=[str(i) for i in range(14)], label_list=[str(i) for i in range(14)],
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -56,7 +57,9 @@ class THUCNEWS(BaseNLPDataset): ...@@ -56,7 +57,9 @@ class THUCNEWS(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = THUCNEWS() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = THUCNEWS(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
...@@ -67,3 +70,6 @@ if __name__ == "__main__": ...@@ -67,3 +70,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]: for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds) print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -20,7 +20,8 @@ from __future__ import print_function ...@@ -20,7 +20,8 @@ from __future__ import print_function
import io import io
import os import os
from paddlehub.dataset import InputExample, BaseDataset from paddlehub.dataset import InputExample
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/tnews.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/tnews.tar.gz"
...@@ -44,12 +45,12 @@ LABEL_NAME = { ...@@ -44,12 +45,12 @@ LABEL_NAME = {
} }
class TNews(BaseDataset): class TNews(TextClassificationDataset):
""" """
TNews is the Chinese news classification dataset from the Jinri Toutiao app. TNews is the Chinese news classification dataset from the Jinri Toutiao app.
""" """
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "tnews") dataset_dir = os.path.join(DATA_HOME, "tnews")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
label_list = [ label_list = [
...@@ -63,7 +64,8 @@ class TNews(BaseDataset): ...@@ -63,7 +64,8 @@ class TNews(BaseDataset):
test_file="toutiao_category_test.txt", test_file="toutiao_category_test.txt",
label_file=None, label_file=None,
label_list=label_list, label_list=label_list,
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def get_label_name(self, id): def get_label_name(self, id):
return LABEL_NAME[id] return LABEL_NAME[id]
...@@ -82,7 +84,9 @@ class TNews(BaseDataset): ...@@ -82,7 +84,9 @@ class TNews(BaseDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = TNews() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = TNews(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
...@@ -93,3 +97,6 @@ if __name__ == "__main__": ...@@ -93,3 +97,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]: for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds) print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -22,18 +22,18 @@ import pandas as pd ...@@ -22,18 +22,18 @@ import pandas as pd
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import MultiLabelDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/toxic.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/toxic.tar.gz"
class Toxic(BaseNLPDataset): class Toxic(MultiLabelDataset):
""" """
The Kaggle Toxic dataset: The Kaggle Toxic dataset:
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge
""" """
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "toxic") dataset_dir = os.path.join(DATA_HOME, "toxic")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
label_list = [ label_list = [
...@@ -47,7 +47,8 @@ class Toxic(BaseNLPDataset): ...@@ -47,7 +47,8 @@ class Toxic(BaseNLPDataset):
test_file="test.csv", test_file="test.csv",
label_file=None, label_file=None,
label_list=label_list, label_list=label_list,
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -64,7 +65,10 @@ class Toxic(BaseNLPDataset): ...@@ -64,7 +65,10 @@ class Toxic(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = Toxic() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = Toxic(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
...@@ -75,3 +79,6 @@ if __name__ == "__main__": ...@@ -75,3 +79,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]: for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds) print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -25,19 +25,19 @@ import csv ...@@ -25,19 +25,19 @@ import csv
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/XNLI-lan.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/XNLI-lan.tar.gz"
class XNLI(BaseNLPDataset): class XNLI(TextClassificationDataset):
""" """
Please refer to Please refer to
https://arxiv.org/pdf/1809.05053.pdf https://arxiv.org/pdf/1809.05053.pdf
for more information for more information
""" """
def __init__(self, language='zh'): def __init__(self, language='zh', tokenizer=None, max_seq_len=None):
if language not in [ if language not in [
"ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw", "ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw",
"th", "tr", "ur", "vi", "zh" "th", "tr", "ur", "vi", "zh"
...@@ -55,6 +55,8 @@ class XNLI(BaseNLPDataset): ...@@ -55,6 +55,8 @@ class XNLI(BaseNLPDataset):
test_file="%s_test.tsv" % language, test_file="%s_test.tsv" % language,
label_file=None, label_file=None,
label_list=["neutral", "contradiction", "entailment"], label_list=["neutral", "contradiction", "entailment"],
tokenizer=tokenizer,
max_seq_len=max_seq_len,
) )
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
...@@ -74,7 +76,10 @@ class XNLI(BaseNLPDataset): ...@@ -74,7 +76,10 @@ class XNLI(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = XNLI() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = XNLI(tokenizer=tokenizer, max_seq_len=20)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......
...@@ -167,7 +167,7 @@ class DefaultStrategy(object): ...@@ -167,7 +167,7 @@ class DefaultStrategy(object):
self.optimizer = fluid.optimizer.Adam( self.optimizer = fluid.optimizer.Adam(
learning_rate=self.learning_rate, **kwargs) learning_rate=self.learning_rate, **kwargs)
def execute(self, loss, data_reader, config, dev_count): def execute(self, loss, max_train_steps):
if self.optimizer is not None: if self.optimizer is not None:
self.optimizer.minimize(loss) self.optimizer.minimize(loss)
else: else:
...@@ -456,26 +456,9 @@ class CombinedStrategy(DefaultStrategy): ...@@ -456,26 +456,9 @@ class CombinedStrategy(DefaultStrategy):
"weight_decay"] * scheduled_lr "weight_decay"] * scheduled_lr
fluid.layers.assign(output=param, input=updated_param) fluid.layers.assign(output=param, input=updated_param)
def execute(self, loss, data_reader, config, dev_count): def execute(self, loss, max_train_steps):
# base information # base information
self.main_program = loss.block.program self.main_program = loss.block.program
self.config = config
# self.num_examples = {'train': -1, 'dev': -1, 'test': -1} before data_generator
data_reader.data_generator(
batch_size=config.batch_size, phase='train', shuffle=True)
num_train_examples = data_reader.num_examples['train']
max_train_steps = config.num_epoch * num_train_examples // config.batch_size // dev_count
try:
# nlp_reader
_in_tokens = data_reader.in_tokens
if _in_tokens:
max_train_steps *= data_reader.max_seq_len
except:
# cv_reader without .in_tokens and .max_seq_len
pass
if self.scheduler["discriminative"]["blocks"] > 0 or self.scheduler[ if self.scheduler["discriminative"]["blocks"] > 0 or self.scheduler[
"gradual_unfreeze"]["blocks"] > 0: "gradual_unfreeze"]["blocks"] > 0:
...@@ -494,8 +477,7 @@ class CombinedStrategy(DefaultStrategy): ...@@ -494,8 +477,7 @@ class CombinedStrategy(DefaultStrategy):
self.regularization_handler(loss, scheduled_lr) self.regularization_handler(loss, scheduled_lr)
logger.info(self.__str__()) logger.info(self.__str__())
return scheduled_lr
return scheduled_lr, max_train_steps
def exclude_from_weight_decay(self, name): def exclude_from_weight_decay(self, name):
if name.find("layer_norm") > -1: if name.find("layer_norm") > -1:
......
...@@ -35,6 +35,7 @@ import paddle.fluid as fluid ...@@ -35,6 +35,7 @@ import paddle.fluid as fluid
from visualdl import LogWriter from visualdl import LogWriter
import paddlehub as hub import paddlehub as hub
from paddlehub.reader.nlp_reader import BaseNLPReader
from paddlehub.common.paddle_helper import dtype_map, clone_program from paddlehub.common.paddle_helper import dtype_map, clone_program
from paddlehub.common.utils import mkdir from paddlehub.common.utils import mkdir
from paddlehub.common.dir import tmp_dir from paddlehub.common.dir import tmp_dir
...@@ -84,7 +85,7 @@ class RunEnv(object): ...@@ -84,7 +85,7 @@ class RunEnv(object):
self.start_program = None self.start_program = None
self.main_program_compiled = None self.main_program_compiled = None
self.py_reader = None self.py_reader = None
self.reader = None self.generator = None
self.loss = None self.loss = None
self.labels = None self.labels = None
self.metrics = None self.metrics = None
...@@ -260,8 +261,8 @@ class BaseTask(object): ...@@ -260,8 +261,8 @@ class BaseTask(object):
BaseTask is the base class of all the task. It will complete the building of all the running environment. BaseTask is the base class of all the task. It will complete the building of all the running environment.
Args: Args:
feed_list (list): the input variable names feed_list (list): the input variable names. Deprecated in paddlehub v1.8.
data_reader (object): data reader for the task data_reader (object): data reader for the task. Deprecated in paddlehub v1.8.
main_program (object): the customized main_program, default None main_program (object): the customized main_program, default None
startup_program (object): the customized startup_program, default None startup_program (object): the customized startup_program, default None
config (object): the config for the task, default None config (object): the config for the task, default None
...@@ -269,16 +270,13 @@ class BaseTask(object): ...@@ -269,16 +270,13 @@ class BaseTask(object):
""" """
def __init__(self, def __init__(self,
feed_list, dataset=None,
data_reader, feed_list=None,
data_reader=None,
main_program=None, main_program=None,
startup_program=None, startup_program=None,
config=None, config=None,
metrics_choices="default"): metrics_choices="default"):
# base item
self._base_data_reader = data_reader
self._base_feed_list = feed_list
# metrics item # metrics item
self.best_score = -999 self.best_score = -999
if metrics_choices == "default": if metrics_choices == "default":
...@@ -293,7 +291,6 @@ class BaseTask(object): ...@@ -293,7 +291,6 @@ class BaseTask(object):
if main_program is None: if main_program is None:
self._base_main_program = clone_program( self._base_main_program = clone_program(
fluid.default_main_program(), for_test=False) fluid.default_main_program(), for_test=False)
else: else:
self._base_main_program = clone_program( self._base_main_program = clone_program(
main_program, for_test=False) main_program, for_test=False)
...@@ -344,6 +341,23 @@ class BaseTask(object): ...@@ -344,6 +341,23 @@ class BaseTask(object):
# set default phase # set default phase
self.enter_phase("train") self.enter_phase("train")
self.dataset = dataset
if dataset:
self._label_list = dataset.get_labels()
# Compatible code for usage deprecated in paddlehub v1.8.
self._base_data_reader = data_reader
self._base_feed_list = feed_list
if isinstance(data_reader, BaseNLPReader):
self._compatible_mode = True
logger.warning(
"PaddleHub v1.8 has deprecated the reader and feed_list parameters in the nlp Task. We provided an easier usage, "
"in which you can use your tokenizer to preprocess dataset and run task in a clear flow. "
"New demo see https://github.com/PaddlePaddle/PaddleHub/blob/release/v1.8/demo/text_classification/text_cls.py"
)
else:
self._compatible_mode = False
@contextlib.contextmanager @contextlib.contextmanager
def phase_guard(self, phase): def phase_guard(self, phase):
self.enter_phase(phase) self.enter_phase(phase)
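For orientation, a minimal sketch of the non-compatible (v1.8) flow that the deprecation warning above points to. The module name, dataset, and run settings are illustrative assumptions, not taken from this diff; the pattern is tokenizer -> dataset -> task with a dataset parameter instead of data_reader/feed_list:

import paddlehub as hub

# Illustrative module and dataset; any tokenizer-aware dataset works the same way.
module = hub.Module(name="ernie")
inputs, outputs, program = module.context(trainable=True, max_seq_len=128)

tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.ChnSentiCorp(tokenizer=tokenizer, max_seq_len=128)

cls_task = hub.TextClassifierTask(
    dataset=dataset,                    # replaces data_reader
    feature=outputs["pooled_output"],   # feed_list is now derived automatically
    num_classes=dataset.num_labels,
    config=hub.RunConfig(num_epoch=1, batch_size=32))
cls_task.finetune_and_eval()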
...@@ -420,9 +434,29 @@ class BaseTask(object): ...@@ -420,9 +434,29 @@ class BaseTask(object):
with fluid.program_guard(self.env.main_program, with fluid.program_guard(self.env.main_program,
self._base_startup_program): self._base_startup_program):
with fluid.unique_name.guard(self.env.UNG): with fluid.unique_name.guard(self.env.UNG):
self.scheduled_lr, self.max_train_steps = self.config.strategy.execute( if self._compatible_mode:
self.loss, self._base_data_reader, self.config, # This branch is compatible code for usage deprecated in paddlehub v1.8.
self.device_count) self._base_data_reader.data_generator(
batch_size=self.config.batch_size,
phase='train',
shuffle=True)
num_train_examples = self._base_data_reader.num_examples[
'train']
try:
# nlp_reader
_in_tokens = self._base_data_reader.in_tokens
if _in_tokens:
num_train_examples *= self._base_data_reader.max_seq_len
except:
# cv_reader without .in_tokens and .max_seq_len
pass
else:
num_train_examples = len(
self.dataset.get_train_records())
self.max_train_steps = self.config.num_epoch * num_train_examples // self.config.batch_size // self.device_count
self.scheduled_lr = self.config.strategy.execute(
self.loss, self.max_train_steps)
if self.is_train_phase: if self.is_train_phase:
loss_name = self.env.loss.name loss_name = self.env.loss.name
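A worked instance of the step computation above, with made-up sizes, to make the formula concrete:

# Hypothetical numbers only.
num_train_examples = 9600
num_epoch, batch_size, device_count = 3, 32, 1
max_train_steps = num_epoch * num_train_examples // batch_size // device_count
assert max_train_steps == 900   # 9600 / 32 = 300 steps per epoch, 3 epochs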
...@@ -529,17 +563,40 @@ class BaseTask(object): ...@@ -529,17 +563,40 @@ class BaseTask(object):
return self.main_program return self.main_program
@property @property
def reader(self): def generator(self):
if self.is_predict_phase: if self._compatible_mode:
data = self._predict_data if self.is_predict_phase:
data = self._predict_data
else:
data = None
self.env.generator = self._base_data_reader.data_generator(
batch_size=self.config.batch_size,
phase=self.phase,
data=data,
return_list=not self.config.use_pyreader)
else: else:
data = None
self.env.reader = self._base_data_reader.data_generator( def data_generator(records):
batch_size=self.config.batch_size, def wrapper():
phase=self.phase, for record in records:
data=data, values = []
return_list=not self.config.use_pyreader) for feed_name in self.feed_list:
return self.env.reader values.append(record[feed_name])
yield values
return wrapper
if self.is_predict_phase:
records = self._predict_data
else:
if self.is_train_phase:
shuffle = True
else:
shuffle = False
records = self.dataset.get_records(
phase=self.phase, shuffle=shuffle)
self.env.generator = data_generator(records)
return self.env.generator
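The non-compatible branch above yields one sample per record; downstream it is wrapped by paddle.batch (see _run_with_predictor below). A self-contained sketch with made-up records and feed names:

import paddle

# Made-up records and feed names, standing in for dataset.get_records(...) output.
records = [{"input_ids": [1, 2, 3], "label": [0]},
           {"input_ids": [4, 5, 6], "label": [1]}]
feed_list = ["input_ids", "label"]

def data_generator(records):
    def wrapper():
        for record in records:
            yield [record[name] for name in feed_list]
    return wrapper

for batch in paddle.batch(data_generator(records), batch_size=2)():
    print(len(batch))   # 2 samples, each a [input_ids, label] list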
@property @property
def loss(self): def loss(self):
...@@ -580,13 +637,30 @@ class BaseTask(object): ...@@ -580,13 +637,30 @@ class BaseTask(object):
@property @property
def feed_list(self): def feed_list(self):
feed_list = [varname for varname in self._base_feed_list] if self._compatible_mode:
if self.is_train_phase or self.is_test_phase: feed_list = [varname for varname in self._base_feed_list]
feed_list += [label.name for label in self.labels] if self.is_train_phase or self.is_test_phase:
feed_list += [label.name for label in self.labels]
else:
if not self.env.is_inititalized:
self._build_env()
if self._predict_data:
feed_list = list(self._predict_data[0].keys())
else:
feed_list = self.dataset.get_feed_list(self.phase)
feed_list = [
feed_name for feed_name in feed_list
if feed_name in self.main_program.global_block().vars
]
return feed_list return feed_list
@property @property
def feed_var_list(self): def feed_var_list(self):
if not self.env.is_inititalized:
self._build_env()
vars = self.main_program.global_block().vars vars = self.main_program.global_block().vars
return [vars[varname] for varname in self.feed_list] return [vars[varname] for varname in self.feed_list]
...@@ -890,13 +964,20 @@ class BaseTask(object): ...@@ -890,13 +964,20 @@ class BaseTask(object):
self.env.current_epoch += 1 self.env.current_epoch += 1
# Final evaluation # Final evaluation
if self._base_data_reader.get_dev_examples() != []: if self._compatible_mode:
dev_examples = self._base_data_reader.get_dev_examples()
test_examples = self._base_data_reader.get_test_examples()
else:
dev_examples = self.dataset.get_dev_examples()
test_examples = self.dataset.get_test_examples()
if dev_examples != []:
# Warning: DO NOT use self.eval(phase="dev", load_best_model=True) during training. # Warning: DO NOT use self.eval(phase="dev", load_best_model=True) during training.
# It will make the trainer unable to continue training from the checkpoint after eval. # It will make the trainer unable to continue training from the checkpoint after eval.
# More importantly, the model should evaluate its current performance during training. # More importantly, the model should evaluate its current performance during training.
self.eval(phase="dev") self.eval(phase="dev")
if self._base_data_reader.get_test_examples() != []: if test_examples != []:
self.eval(phase="test", load_best_model=True) self.eval(phase="test", load_best_model=True)
# Save checkpoint after finetune # Save checkpoint after finetune
self.save_checkpoint() self.save_checkpoint()
...@@ -957,17 +1038,41 @@ class BaseTask(object): ...@@ -957,17 +1038,41 @@ class BaseTask(object):
global_run_states = [] global_run_states = []
period_run_states = [] period_run_states = []
for run_step, batch in enumerate(self.reader(), start=1): feed_var_shape = []
feed_var_type = []
for var in self.feed_var_list:
feed_var_shape.append(var.shape)
feed_var_type.append(dtype_map[var.dtype])
if self._compatible_mode:
data_reader = self.generator
else:
data_reader = paddle.batch(
self.generator, batch_size=self.config.batch_size)
for batch in data_reader():
if self._compatible_mode and not self.config.use_pyreader:
# if pyreader is not used, the nlp_reader returns [batch]
batch = batch[0]
step_run_state = RunState(len(self.fetch_list)) step_run_state = RunState(len(self.fetch_list))
step_run_state.run_step = 1 step_run_state.run_step = 1
num_batch_examples = len(batch) num_batch_examples = len(batch)
if not self.config.use_pyreader: # Processing data into the suitable shape and type for the model
# if use pyreader, the nlp_reader return [batch] processed_batch = [[] for i in range(len(self.feed_list))]
batch = batch[0] if self._compatible_mode:
processed_batch = batch
batch = [fluid.core.PaddleTensor(data) for data in batch] else:
fetch_result = self._predictor.run(batch) for sample in batch:
for i, data in enumerate(sample):
processed_batch[i].append(data)
tensor_batch = [[] for i in range(len(self.feed_list))]
for i in range(len(processed_batch)):
processed_batch[i] = np.array(processed_batch[i]).reshape(
feed_var_shape[i]).astype(feed_var_type[i])
tensor_batch[i] = fluid.core.PaddleTensor(processed_batch[i])
fetch_result = self._predictor.run(tensor_batch)
for index, result in enumerate(fetch_result): for index, result in enumerate(fetch_result):
step_run_state.run_results[index] = result.as_ndarray() step_run_state.run_results[index] = result.as_ndarray()
step_run_state.run_examples += num_batch_examples step_run_state.run_examples += num_batch_examples
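In isolation, the column-wise regrouping performed above (before each column is wrapped into a PaddleTensor) looks like this; the numbers are made up:

import numpy as np

batch = [[[1, 2, 3], [0]],     # sample 1: input_ids, label
         [[4, 5, 6], [1]]]     # sample 2
processed_batch = [[] for _ in range(2)]   # one bucket per feed variable
for sample in batch:
    for i, data in enumerate(sample):
        processed_batch[i].append(data)
processed_batch = [np.array(col) for col in processed_batch]
print(processed_batch[0].shape, processed_batch[1].shape)   # (2, 3) (2, 1)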
...@@ -978,18 +1083,23 @@ class BaseTask(object): ...@@ -978,18 +1083,23 @@ class BaseTask(object):
global_run_states += period_run_states global_run_states += period_run_states
return global_run_states return global_run_states
def predict(self, def predict(
data, self,
load_best_model=True, data=None,
return_result=False, label_list=None,
accelerate_mode=True): load_best_model=True,
return_result=False,
accelerate_mode=True,
):
""" """
make prediction for the input data. make prediction for the input data.
Args: Args:
data (list): the data to be predicted. data (list): the data to be predicted. Each element should be a record when the task is initialized without the data_reader param,
or a plaintext string list when the task is initialized with the data_reader param (deprecated in paddlehub v1.8).
label_list (list): the label list, used to postprocess the output.
load_best_model (bool): load the best model or not load_best_model (bool): load the best model or not
return_result (bool): return a readable result or just the raw run result return_result (bool): return a readable result or just the raw run result. Always True when the task is not initialized with data_reader param.
accelerate_mode (bool): use high-performance predictor or not accelerate_mode (bool): use high-performance predictor or not
Returns: Returns:
...@@ -1005,6 +1115,7 @@ class BaseTask(object): ...@@ -1005,6 +1115,7 @@ class BaseTask(object):
with self.phase_guard(phase="predict"): with self.phase_guard(phase="predict"):
self._predict_data = data self._predict_data = data
self._label_list = label_list
self._predict_start_event() self._predict_start_event()
if load_best_model: if load_best_model:
...@@ -1020,7 +1131,7 @@ class BaseTask(object): ...@@ -1020,7 +1131,7 @@ class BaseTask(object):
self._predict_end_event(run_states) self._predict_end_event(run_states)
self._predict_data = None self._predict_data = None
if return_result: if return_result or not self._compatible_mode:
return self._postprocessing(run_states) return self._postprocessing(run_states)
return run_states return run_states
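A sketch of calling predict(...) in the new mode, continuing the earlier sketch (module, tokenizer, dataset, cls_task) and assuming the encode(text=..., max_seq_len=...) helper that this PR's tokenizers provide, which is not shown in this hunk:

texts = ["This movie is great", "Boring and far too long"]
# Each encoded item is a record: a dict keyed by the dataset's feed names.
encoded = [tokenizer.encode(text=t, max_seq_len=128) for t in texts]
results = cls_task.predict(data=encoded, label_list=dataset.get_labels())
print(results)   # e.g. ["positive", "negative"] for a two-class label list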
...@@ -1057,20 +1168,34 @@ class BaseTask(object): ...@@ -1057,20 +1168,34 @@ class BaseTask(object):
capacity=64, capacity=64,
use_double_buffer=True, use_double_buffer=True,
iterable=True) iterable=True)
data_reader = data_loader.set_batch_generator( if self._compatible_mode:
self.reader, places=self.places) data_reader = data_loader.set_batch_generator(
self.generator, places=self.places)
else:
data_reader = data_loader.set_sample_generator(
self.generator,
places=self.places,
batch_size=self.config.batch_size,
drop_last=True)
else: else:
data_feeder = fluid.DataFeeder( data_feeder = fluid.DataFeeder(
feed_list=self.feed_list, place=self.place) feed_list=self.feed_list, place=self.place)
data_reader = data_feeder.decorate_reader( if self._compatible_mode:
self.reader, data_reader = data_feeder.decorate_reader(
multi_devices=self.config.use_data_parallel, self.generator,
drop_last=True) multi_devices=self.config.use_data_parallel,
drop_last=True)
else:
data_reader = data_feeder.decorate_reader(
paddle.batch(
self.generator, batch_size=self.config.batch_size),
multi_devices=self.config.use_data_parallel,
drop_last=True)
global_run_states = [] global_run_states = []
period_run_states = [] period_run_states = []
for run_step, batch in enumerate(data_reader(), start=1): for batch in data_reader():
step_run_state = RunState(len(self.fetch_list)) step_run_state = RunState(len(self.fetch_list))
step_run_state.run_step = 1 step_run_state.run_step = 1
num_batch_examples = len(batch) num_batch_examples = len(batch)
...@@ -1107,6 +1232,5 @@ class BaseTask(object): ...@@ -1107,6 +1232,5 @@ class BaseTask(object):
return global_run_states return global_run_states
def __repr__(self): def __repr__(self):
return "Task: %s with metrics_choices: %s, reader: %s, %s" % ( return "Task: %s with metrics_choices: %s, %s" % (
self.__class__.__name__, self.metrics_choices, self.__class__.__name__, self.metrics_choices, self.config)
self._base_data_reader.__class__.__name__, self.config)
...@@ -19,13 +19,12 @@ from __future__ import print_function ...@@ -19,13 +19,12 @@ from __future__ import print_function
from collections import OrderedDict from collections import OrderedDict
import numpy as np import numpy as np
import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import time import time
from paddlehub.common.logger import logger from paddlehub.common.logger import logger
from paddlehub.finetune.evaluate import calculate_f1_np, matthews_corrcoef from paddlehub.finetune.evaluate import calculate_f1_np, matthews_corrcoef
from paddlehub.reader.nlp_reader import ClassifyReader from paddlehub.reader.nlp_reader import ClassifyReader, LACClassifyReader
import paddlehub.network as net import paddlehub.network as net
from .base_task import BaseTask from .base_task import BaseTask
...@@ -35,8 +34,9 @@ class ClassifierTask(BaseTask): ...@@ -35,8 +34,9 @@ class ClassifierTask(BaseTask):
def __init__(self, def __init__(self,
feature, feature,
num_classes, num_classes,
feed_list, dataset=None,
data_reader, feed_list=None,
data_reader=None,
startup_program=None, startup_program=None,
config=None, config=None,
hidden_units=None, hidden_units=None,
...@@ -46,6 +46,7 @@ class ClassifierTask(BaseTask): ...@@ -46,6 +46,7 @@ class ClassifierTask(BaseTask):
main_program = feature.block.program main_program = feature.block.program
super(ClassifierTask, self).__init__( super(ClassifierTask, self).__init__(
dataset=dataset,
data_reader=data_reader, data_reader=data_reader,
main_program=main_program, main_program=main_program,
feed_list=feed_list, feed_list=feed_list,
...@@ -109,7 +110,7 @@ class ClassifierTask(BaseTask): ...@@ -109,7 +110,7 @@ class ClassifierTask(BaseTask):
run_examples += run_state.run_examples run_examples += run_state.run_examples
run_step += run_state.run_step run_step += run_state.run_step
loss_sum += np.mean( loss_sum += np.mean(
run_state.run_results[-2]) * run_state.run_examples run_state.run_results[-1]) * run_state.run_examples
acc_sum += np.mean( acc_sum += np.mean(
run_state.run_results[2]) * run_state.run_examples run_state.run_results[2]) * run_state.run_examples
np_labels = run_state.run_results[0] np_labels = run_state.run_results[0]
...@@ -140,20 +141,28 @@ class ClassifierTask(BaseTask): ...@@ -140,20 +141,28 @@ class ClassifierTask(BaseTask):
return scores, avg_loss, run_speed return scores, avg_loss, run_speed
def _postprocessing(self, run_states): def _postprocessing(self, run_states):
try: if self._compatible_mode:
id2label = { try:
val: key label_list = list(self._base_data_reader.label_map.keys())
for key, val in self._base_data_reader.label_map.items() except:
} raise Exception(
except: "ImageClassificationDataset does not support postprocessing, please use BaseCVDataset instead"
raise Exception( )
"ImageClassificationDataset does not support postprocessing, please use BaseCVDataset instead" else:
) if self._label_list:
label_list = self._label_list
else:
logger.warning(
"Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
)
return run_states
results = [] results = []
for batch_state in run_states: for batch_state in run_states:
batch_result = batch_state.run_results batch_result = batch_state.run_results
batch_infer = np.argmax(batch_result[0], axis=1) batch_infer = np.argmax(batch_result[0], axis=1)
results += [id2label[sample_infer] for sample_infer in batch_infer] results += [
label_list[sample_infer] for sample_infer in batch_infer
]
return results return results
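What the single-label post-processing above reduces to, with toy probabilities and label names:

import numpy as np

label_list = ["negative", "positive"]             # e.g. dataset.get_labels()
batch_probs = np.array([[0.2, 0.8], [0.9, 0.1]])  # run_results[0] for one batch
predictions = [label_list[i] for i in np.argmax(batch_probs, axis=1)]
print(predictions)   # ['positive', 'negative']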
...@@ -166,22 +175,24 @@ class TextClassifierTask(ClassifierTask): ...@@ -166,22 +175,24 @@ class TextClassifierTask(ClassifierTask):
It will use a fully-connected layer with a softmax activation function to classify texts. It will use a fully-connected layer with a softmax activation function to classify texts.
""" """
def __init__(self, def __init__(
num_classes, self,
feed_list, num_classes,
data_reader, dataset=None,
feature=None, feed_list=None, # Deprecated
token_feature=None, data_reader=None, # Deprecated
network=None, feature=None,
startup_program=None, token_feature=None,
config=None, network=None,
hidden_units=None, startup_program=None,
metrics_choices="default"): config=None,
hidden_units=None,
metrics_choices="default"):
""" """
Args: Args:
num_classes: total labels of the text classification task. num_classes: total labels of the text classification task.
feed_list(list): the variable names that will be fed to the main program feed_list(list): the variable names that will be fed to the main program. Deprecated in paddlehub v1.8.
data_reader(object): data reader for the task. It must be one of ClassifyReader and LACClassifyReader. data_reader(object): data reader for the task. It must be one of ClassifyReader and LACClassifyReader. Deprecated in paddlehub v1.8.
feature(Variable): the `feature` will be used to classify texts. It must be the sentence-level feature, with shape [-1, emb_size]. `token_feature` and `feature` cannot be set at the same time. Exactly one of them must be set. Default None. feature(Variable): the `feature` will be used to classify texts. It must be the sentence-level feature, with shape [-1, emb_size]. `token_feature` and `feature` cannot be set at the same time. Exactly one of them must be set. Default None.
token_feature(Variable): the `token_feature` will be used to connect the pre-defined network. It must be the token-level feature, with shape [-1, seq_len, emb_size]. Default None. token_feature(Variable): the `token_feature` will be used to connect the pre-defined network. It must be the token-level feature, with shape [-1, seq_len, emb_size]. Default None.
network(str): the pre-defined network. Choices: 'bilstm', 'bow', 'cnn', 'dpcnn', 'gru' and 'lstm'. Default None. If network is set, then `token_feature` must be set and `feature` must be None. network(str): the pre-defined network. Choices: 'bilstm', 'bow', 'cnn', 'dpcnn', 'gru' and 'lstm'. Default None. If network is set, then `token_feature` must be set and `feature` must be None.
...@@ -193,12 +204,12 @@ class TextClassifierTask(ClassifierTask): ...@@ -193,12 +204,12 @@ class TextClassifierTask(ClassifierTask):
""" """
if (not feature) and (not token_feature): if (not feature) and (not token_feature):
logger.error( logger.error(
'Both token_feature and feature are None, one of them must be setted.' 'Both token_feature and feature are None, one of them must be set.'
) )
exit(1) exit(1)
elif feature and token_feature: elif feature and token_feature:
logger.error( logger.error(
'Both token_feature and feature are setted. One should be setted, the other should be None.' 'Both token_feature and feature are set. One should be set, the other should be None.'
) )
exit(1) exit(1)
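A hedged usage sketch of the feature/token_feature choice documented above, reusing the hub, outputs, dataset, and config objects from the earlier sketch; the "sequence_output" key is an assumption about the upstream module, not something this hunk defines:

# Sentence-level feature + fc head:
cls_task = hub.TextClassifierTask(
    dataset=dataset,
    feature=outputs["pooled_output"],
    num_classes=dataset.num_labels,
    config=config)

# Or: token-level feature + a pre-defined network
# (one of 'bilstm', 'bow', 'cnn', 'dpcnn', 'gru', 'lstm'):
cls_task = hub.TextClassifierTask(
    dataset=dataset,
    token_feature=outputs["sequence_output"],   # shape [-1, seq_len, emb_size]
    network="bilstm",
    num_classes=dataset.num_labels,
    config=config)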
...@@ -226,6 +237,7 @@ class TextClassifierTask(ClassifierTask): ...@@ -226,6 +237,7 @@ class TextClassifierTask(ClassifierTask):
metrics_choices = ["acc"] metrics_choices = ["acc"]
super(TextClassifierTask, self).__init__( super(TextClassifierTask, self).__init__(
dataset=dataset,
data_reader=data_reader, data_reader=data_reader,
feature=feature if feature else token_feature, feature=feature if feature else token_feature,
num_classes=num_classes, num_classes=num_classes,
...@@ -236,16 +248,14 @@ class TextClassifierTask(ClassifierTask): ...@@ -236,16 +248,14 @@ class TextClassifierTask(ClassifierTask):
metrics_choices=metrics_choices) metrics_choices=metrics_choices)
def _build_net(self): def _build_net(self):
if isinstance(self._base_data_reader, ClassifyReader): if not isinstance(self._base_data_reader, LACClassifyReader):
# ClassifyReader will return the sequence length of an input text # LACClassifyReader won't return the sequence length, while a Dataset with a tokenizer and ClassifyReader will.
self.seq_len = fluid.layers.data( self.seq_len = fluid.layers.data(
name="seq_len", shape=[1], dtype='int64', lod_level=0) name="seq_len", shape=[1], dtype='int64', lod_level=0)
self.seq_len_used = fluid.layers.squeeze(self.seq_len, axes=[1]) self.seq_len_used = fluid.layers.squeeze(self.seq_len, axes=[1])
# unpad the token_feature # unpad the token_feature
unpad_feature = fluid.layers.sequence_unpad( unpad_feature = fluid.layers.sequence_unpad(
self.feature, length=self.seq_len_used) self.feature, length=self.seq_len_used)
if self.network: if self.network:
# add pre-defined net # add pre-defined net
net_func = getattr(net.classification, self.network) net_func = getattr(net.classification, self.network)
...@@ -254,9 +264,14 @@ class TextClassifierTask(ClassifierTask): ...@@ -254,9 +264,14 @@ class TextClassifierTask(ClassifierTask):
cls_feats = net_func( cls_feats = net_func(
self.feature, emb_dim=self.feature.shape[-1]) self.feature, emb_dim=self.feature.shape[-1])
else: else:
cls_feats = net_func(unpad_feature) if self._compatible_mode and isinstance(self._base_data_reader,
logger.info( LACClassifyReader):
"%s has been added in the TextClassifierTask!" % self.network) cls_feats = net_func(self.feature)
else:
cls_feats = net_func(unpad_feature)
if self.is_train_phase:
logger.info("%s has been added in the TextClassifierTask!" %
self.network)
else: else:
# do not use a pre-defined net; use an fc net instead # do not use a pre-defined net; use an fc net instead
cls_feats = fluid.layers.dropout( cls_feats = fluid.layers.dropout(
...@@ -286,12 +301,15 @@ class TextClassifierTask(ClassifierTask): ...@@ -286,12 +301,15 @@ class TextClassifierTask(ClassifierTask):
@property @property
def feed_list(self): def feed_list(self):
feed_list = [varname for varname in self._base_feed_list] if self._compatible_mode:
if isinstance(self._base_data_reader, ClassifyReader): feed_list = [varname for varname in self._base_feed_list]
# ClassifyReader will return the sequence length of an input text # ClassifyReader will return the sequence length of an input text
feed_list += [self.seq_len.name] # ClassifyReader will return the seqence length of an input text
if self.is_train_phase or self.is_test_phase: feed_list += [self.seq_len.name]
feed_list += [self.labels[0].name] if self.is_train_phase or self.is_test_phase:
feed_list += [self.labels[0].name]
else:
feed_list = super(TextClassifierTask, self).feed_list
return feed_list return feed_list
@property @property
...@@ -303,11 +321,10 @@ class TextClassifierTask(ClassifierTask): ...@@ -303,11 +321,10 @@ class TextClassifierTask(ClassifierTask):
] ]
else: else:
# predict phase # predict phase
fetch_list = [self.outputs[0].name] if isinstance(self._base_data_reader, LACClassifyReader):
fetch_list = [self.outputs[0].name]
if isinstance(self._base_data_reader, ClassifyReader): else:
# to avoid save_inference_model to prune seq_len variable fetch_list = [self.outputs[0].name, self.seq_len.name]
fetch_list += [self.seq_len.name]
return fetch_list return fetch_list
...@@ -316,8 +333,9 @@ class MultiLabelClassifierTask(ClassifierTask): ...@@ -316,8 +333,9 @@ class MultiLabelClassifierTask(ClassifierTask):
def __init__(self, def __init__(self,
feature, feature,
num_classes, num_classes,
feed_list, dataset=None,
data_reader, feed_list=None,
data_reader=None,
startup_program=None, startup_program=None,
config=None, config=None,
hidden_units=None, hidden_units=None,
...@@ -325,8 +343,8 @@ class MultiLabelClassifierTask(ClassifierTask): ...@@ -325,8 +343,8 @@ class MultiLabelClassifierTask(ClassifierTask):
if metrics_choices == "default": if metrics_choices == "default":
metrics_choices = ["auc"] metrics_choices = ["auc"]
main_program = feature.block.program
super(MultiLabelClassifierTask, self).__init__( super(MultiLabelClassifierTask, self).__init__(
dataset=dataset,
data_reader=data_reader, data_reader=data_reader,
feature=feature, feature=feature,
num_classes=num_classes, num_classes=num_classes,
...@@ -335,7 +353,10 @@ class MultiLabelClassifierTask(ClassifierTask): ...@@ -335,7 +353,10 @@ class MultiLabelClassifierTask(ClassifierTask):
config=config, config=config,
hidden_units=hidden_units, hidden_units=hidden_units,
metrics_choices=metrics_choices) metrics_choices=metrics_choices)
self.class_name = list(data_reader.label_map.keys()) if self._compatible_mode:
self.class_name = list(data_reader.label_map.keys())
else:
self.class_name = self._label_list
def _build_net(self): def _build_net(self):
cls_feats = fluid.layers.dropout( cls_feats = fluid.layers.dropout(
...@@ -428,13 +449,22 @@ class MultiLabelClassifierTask(ClassifierTask): ...@@ -428,13 +449,22 @@ class MultiLabelClassifierTask(ClassifierTask):
def _postprocessing(self, run_states): def _postprocessing(self, run_states):
results = [] results = []
label_list = list(self._base_data_reader.label_map.keys()) if self._compatible_mode:
label_list = list(self._base_data_reader.label_map.keys())
else:
if self._label_list:
label_list = self._label_list
else:
logger.warning(
"Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
)
return run_states
for batch_state in run_states: for batch_state in run_states:
batch_result = batch_state.run_results batch_result = batch_state.run_results
for sample_id in range(len(batch_result[0])): for sample_id in range(len(batch_result[0])):
sample_result = [] sample_result = []
for category_id in range( for category_id in range(len(label_list)):
self._base_data_reader.dataset.num_labels):
sample_category_prob = batch_result[category_id][sample_id] sample_category_prob = batch_result[category_id][sample_id]
sample_category_value = np.argmax(sample_category_prob) sample_category_value = np.argmax(sample_category_prob)
sample_result.append( sample_result.append(
......
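The shape of the multi-label post-processing above, in isolation: the task has one binary head per class, and each head's two logits are argmaxed independently. Label names and numbers below are made up:

import numpy as np

label_list = ["toxic", "obscene", "insult"]
batch_result = [np.array([[0.1, 0.9]]),   # head for "toxic"
                np.array([[0.7, 0.3]]),   # head for "obscene"
                np.array([[0.4, 0.6]])]   # head for "insult"
sample_result = [int(np.argmax(batch_result[c][0])) for c in range(len(label_list))]
print(sample_result)   # [1, 0, 1]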
...@@ -18,23 +18,22 @@ from __future__ import division ...@@ -18,23 +18,22 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import time import time
import os
import collections import collections
import math import math
import six import six
import json import json
from collections import OrderedDict
import io import io
from tqdm import tqdm
import numpy as np import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
from .base_task import BaseTask
from paddlehub.common.logger import logger from paddlehub.common.logger import logger
from paddlehub.reader import tokenization from paddlehub.reader import tokenization
from paddlehub.finetune.evaluator import squad1_evaluate from paddlehub.finetune.evaluator import squad1_evaluate
from paddlehub.finetune.evaluator import squad2_evaluate from paddlehub.finetune.evaluator import squad2_evaluate
from paddlehub.finetune.evaluator import cmrc2018_evaluate from paddlehub.finetune.evaluator import cmrc2018_evaluate
from .base_task import BaseTask
def _get_best_indexes(logits, n_best_size): def _get_best_indexes(logits, n_best_size):
...@@ -193,183 +192,189 @@ def get_predictions(all_examples, all_features, all_results, n_best_size, ...@@ -193,183 +192,189 @@ def get_predictions(all_examples, all_features, all_results, n_best_size,
all_nbest_json = collections.OrderedDict() all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict()
for (example_index, example) in enumerate(all_examples): logger.info("Post processing...")
features = example_index_to_features[example_index] with tqdm(total=len(all_examples)) as process_bar:
for (example_index, example) in enumerate(all_examples):
prelim_predictions = [] features = example_index_to_features[example_index]
# keep track of the minimum score of null start+end of position 0
score_null = 1000000 # large and positive prelim_predictions = []
min_null_feature_index = 0 # the paragraph slice with min null score score_null = 1000000 # large and positive
null_start_logit = 0 # the start logit at the slice with min null score min_null_feature_index = 0 # the paragraph slice with min null score
null_end_logit = 0 # the end logit at the slice with min null score min_null_feature_index = 0 # the paragraph slice with min mull score
for (feature_index, feature) in enumerate(features): null_start_logit = 0 # the start logit at the slice with min null score
if feature.unique_id not in unique_id_to_result: null_end_logit = 0 # the end logit at the slice with min null score
logger.info( for (feature_index, feature) in enumerate(features):
"As using multidevice, the last one batch is so small that the feature %s in the last batch is discarded " if feature.unique_id not in unique_id_to_result:
% feature.unique_id) logger.info(
continue "As using multidevice, the last one batch is so small that the feature %s in the last batch is discarded "
result = unique_id_to_result[feature.unique_id] % feature.unique_id)
start_indexes = _get_best_indexes(result.start_logits, n_best_size) continue
end_indexes = _get_best_indexes(result.end_logits, n_best_size) result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits,
n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[
0] + result.end_logits[0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
null_start_logit = result.start_logits[0]
null_end_logit = result.end_logits[0]
for start_index in start_indexes:
for end_index in end_indexes:
# We could hypothetically create invalid predictions, e.g., predict
# that the start of the span is in the question. We throw out all
# invalid predictions.
if start_index >= len(feature.tokens):
continue
if end_index >= len(feature.tokens):
continue
if start_index not in feature.token_to_orig_map:
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(
start_index, False):
continue
if end_index < start_index:
continue
length = end_index - start_index + 1
if length > max_answer_length:
continue
prelim_predictions.append(
_PrelimPrediction(
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative: if version_2_with_negative:
feature_null_score = result.start_logits[0] + result.end_logits[ prelim_predictions.append(
0] _PrelimPrediction(
if feature_null_score < score_null: feature_index=min_null_feature_index,
score_null = feature_null_score start_index=0,
min_null_feature_index = feature_index end_index=0,
null_start_logit = result.start_logits[0] start_logit=null_start_logit,
null_end_logit = result.end_logits[0] end_logit=null_end_logit))
prelim_predictions = sorted(
for start_index in start_indexes: prelim_predictions,
for end_index in end_indexes: key=lambda x: (x.start_logit + x.end_logit),
# We could hypothetically create invalid predictions, e.g., predict reverse=True)
# that the start of the span is in the question. We throw out all
# invalid predictions. seen_predictions = {}
if start_index >= len(feature.tokens): nbest = []
continue if not prelim_predictions:
if end_index >= len(feature.tokens): logger.warning(("not prelim_predictions:", example.qas_id))
continue for pred in prelim_predictions:
if start_index not in feature.token_to_orig_map: if len(nbest) >= n_best_size:
continue break
if end_index not in feature.token_to_orig_map: feature = features[pred.feature_index]
continue if pred.start_index > 0: # this is a non-null prediction
if not feature.token_is_max_context.get(start_index, False): tok_tokens = feature.tokens[pred.start_index:(
continue pred.end_index + 1)]
if end_index < start_index: orig_doc_start = feature.token_to_orig_map[pred.start_index]
continue orig_doc_end = feature.token_to_orig_map[pred.end_index]
length = end_index - start_index + 1 orig_tokens = example.doc_tokens[orig_doc_start:(
if length > max_answer_length: orig_doc_end + 1)]
if is_english:
tok_text = " ".join(tok_tokens)
else:
tok_text = "".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
if is_english:
orig_text = " ".join(orig_tokens)
else:
orig_text = "".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text,
do_lower_case, is_english)
if final_text in seen_predictions:
continue continue
prelim_predictions.append(
_PrelimPrediction( seen_predictions[final_text] = True
feature_index=feature_index,
start_index=start_index,
end_index=end_index,
start_logit=result.start_logits[start_index],
end_logit=result.end_logits[end_index]))
if version_2_with_negative:
prelim_predictions.append(
_PrelimPrediction(
feature_index=min_null_feature_index,
start_index=0,
end_index=0,
start_logit=null_start_logit,
end_logit=null_end_logit))
prelim_predictions = sorted(
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True)
seen_predictions = {}
nbest = []
if not prelim_predictions:
logger.warning(("not prelim_predictions:", example.qas_id))
for pred in prelim_predictions:
if len(nbest) >= n_best_size:
break
feature = features[pred.feature_index]
if pred.start_index > 0: # this is a non-null prediction
tok_tokens = feature.tokens[pred.start_index:(
pred.end_index + 1)]
orig_doc_start = feature.token_to_orig_map[pred.start_index]
orig_doc_end = feature.token_to_orig_map[pred.end_index]
orig_tokens = example.doc_tokens[orig_doc_start:(
orig_doc_end + 1)]
if is_english:
tok_text = " ".join(tok_tokens)
else:
tok_text = "".join(tok_tokens)
# De-tokenize WordPieces that have been split off.
tok_text = tok_text.replace(" ##", "")
tok_text = tok_text.replace("##", "")
# Clean whitespace
tok_text = tok_text.strip()
tok_text = " ".join(tok_text.split())
if is_english:
orig_text = " ".join(orig_tokens)
else: else:
orig_text = "".join(orig_tokens) final_text = ""
seen_predictions[final_text] = True
final_text = get_final_text(tok_text, orig_text, do_lower_case, nbest.append(
is_english) _NbestPrediction(
if final_text in seen_predictions: text=final_text,
continue start_logit=pred.start_logit,
end_logit=pred.end_logit))
seen_predictions[final_text] = True # if we didn't include the empty option in the n-best, include it
else: if version_2_with_negative:
final_text = "" if "" not in seen_predictions:
seen_predictions[final_text] = True nbest.append(
_NbestPrediction(
nbest.append( text="",
_NbestPrediction( start_logit=null_start_logit,
text=final_text, end_logit=null_end_logit))
start_logit=pred.start_logit, # In very rare edge cases we could have no valid predictions. So we
end_logit=pred.end_logit)) # just create a nonce prediction in this case to avoid failure.
if not nbest:
# if we didn't include the empty option in the n-best, include it
if version_2_with_negative:
if "" not in seen_predictions:
nbest.append( nbest.append(
_NbestPrediction( _NbestPrediction(
text="", text="empty", start_logit=0.0, end_logit=0.0))
start_logit=null_start_logit,
end_logit=null_end_logit)) assert len(nbest) >= 1
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
all_predictions[example.qas_id] = nbest_json[0]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null
if best_non_null_entry:
score_diff -= best_non_null_entry.start_logit + best_non_null_entry.end_logit
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json total_scores = []
best_non_null_entry = None
for entry in nbest:
total_scores.append(entry.start_logit + entry.end_logit)
if not best_non_null_entry:
if entry.text:
best_non_null_entry = entry
probs = _compute_softmax(total_scores)
nbest_json = []
for (i, entry) in enumerate(nbest):
output = collections.OrderedDict()
output["text"] = entry.text
output["probability"] = probs[i]
output["start_logit"] = entry.start_logit
output["end_logit"] = entry.end_logit
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
all_predictions[example.qas_id] = nbest_json[0]["text"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null
if best_non_null_entry:
score_diff -= best_non_null_entry.start_logit + best_non_null_entry.end_logit
scores_diff_json[example.qas_id] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example.qas_id] = ""
else:
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
process_bar.update(1)
return all_predictions, all_nbest_json, scores_diff_json return all_predictions, all_nbest_json, scores_diff_json
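A toy reduction of the span selection performed above: each candidate answer is scored by start_logit + end_logit, constrained to end >= start and a maximum answer length. Logits below are invented:

start_logits = [0.1, 2.0, 0.3, 0.0]
end_logits   = [0.0, 0.2, 1.5, 0.1]
max_answer_length = 3
candidates = [(s, e) for s in range(len(start_logits))
              for e in range(s, min(s + max_answer_length, len(end_logits)))]
best = max(candidates, key=lambda p: start_logits[p[0]] + end_logits[p[1]])
print(best)   # (1, 2): tokens 1..2 form the highest-scoring span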
class ReadingComprehensionTask(BaseTask): class ReadingComprehensionTask(BaseTask):
def __init__(self, def __init__(self,
feature, feature,
feed_list, dataset=None,
data_reader, feed_list=None,
data_reader=None,
startup_program=None, startup_program=None,
config=None, config=None,
metrics_choices=None, metrics_choices=None,
...@@ -379,7 +384,9 @@ class ReadingComprehensionTask(BaseTask): ...@@ -379,7 +384,9 @@ class ReadingComprehensionTask(BaseTask):
max_answer_length=30): max_answer_length=30):
main_program = feature.block.program main_program = feature.block.program
self.data_reader = data_reader
super(ReadingComprehensionTask, self).__init__( super(ReadingComprehensionTask, self).__init__(
dataset=dataset,
data_reader=data_reader, data_reader=data_reader,
main_program=main_program, main_program=main_program,
feed_list=feed_list, feed_list=feed_list,
...@@ -387,7 +394,6 @@ class ReadingComprehensionTask(BaseTask): ...@@ -387,7 +394,6 @@ class ReadingComprehensionTask(BaseTask):
config=config, config=config,
metrics_choices=metrics_choices) metrics_choices=metrics_choices)
self.feature = feature self.feature = feature
self.data_reader = data_reader
self.sub_task = sub_task.lower() self.sub_task = sub_task.lower()
self.version_2_with_negative = (self.sub_task == "squad2.0") self.version_2_with_negative = (self.sub_task == "squad2.0")
if self.sub_task in ["squad2.0", "squad"]: if self.sub_task in ["squad2.0", "squad"]:
...@@ -407,10 +413,10 @@ class ReadingComprehensionTask(BaseTask): ...@@ -407,10 +413,10 @@ class ReadingComprehensionTask(BaseTask):
"RawResult", ["unique_id", "start_logits", "end_logits"]) "RawResult", ["unique_id", "start_logits", "end_logits"])
def _build_net(self): def _build_net(self):
self.unique_ids = fluid.layers.data( self.unique_id = fluid.layers.data(
name="unique_ids", shape=[-1, 1], lod_level=0, dtype="int64") name="unique_id", shape=[-1, 1], lod_level=0, dtype="int64")
# to avoid memory optimization # to avoid memory optimization
_ = fluid.layers.assign(self.unique_ids) _ = fluid.layers.assign(self.unique_id)
logits = fluid.layers.fc( logits = fluid.layers.fc(
input=self.feature, input=self.feature,
size=2, size=2,
...@@ -432,24 +438,24 @@ class ReadingComprehensionTask(BaseTask): ...@@ -432,24 +438,24 @@ class ReadingComprehensionTask(BaseTask):
return [start_logits, end_logits, num_seqs] return [start_logits, end_logits, num_seqs]
def _add_label(self): def _add_label(self):
start_positions = fluid.layers.data( start_position = fluid.layers.data(
name="start_positions", shape=[-1, 1], lod_level=0, dtype="int64") name="start_position", shape=[-1, 1], lod_level=0, dtype="int64")
end_positions = fluid.layers.data( end_position = fluid.layers.data(
name="end_positions", shape=[-1, 1], lod_level=0, dtype="int64") name="end_position", shape=[-1, 1], lod_level=0, dtype="int64")
return [start_positions, end_positions] return [start_position, end_position]
def _add_loss(self): def _add_loss(self):
start_positions = self.labels[0] start_position = self.labels[0]
end_positions = self.labels[1] end_position = self.labels[1]
start_logits = self.outputs[0] start_logits = self.outputs[0]
end_logits = self.outputs[1] end_logits = self.outputs[1]
start_loss = fluid.layers.softmax_with_cross_entropy( start_loss = fluid.layers.softmax_with_cross_entropy(
logits=start_logits, label=start_positions) logits=start_logits, label=start_position)
start_loss = fluid.layers.mean(x=start_loss) start_loss = fluid.layers.mean(x=start_loss)
end_loss = fluid.layers.softmax_with_cross_entropy( end_loss = fluid.layers.softmax_with_cross_entropy(
logits=end_logits, label=end_positions) logits=end_logits, label=end_position)
end_loss = fluid.layers.mean(x=end_loss) end_loss = fluid.layers.mean(x=end_loss)
total_loss = (start_loss + end_loss) / 2.0 total_loss = (start_loss + end_loss) / 2.0
return total_loss return total_loss
...@@ -459,22 +465,25 @@ class ReadingComprehensionTask(BaseTask): ...@@ -459,22 +465,25 @@ class ReadingComprehensionTask(BaseTask):
@property @property
def feed_list(self): def feed_list(self):
feed_list = [varname for varname in self._base_feed_list if self._compatible_mode:
] + [self.unique_ids.name] feed_list = [varname for varname in self._base_feed_list
if self.is_train_phase or self.is_test_phase: ] + [self.unique_id.name]
feed_list += [label.name for label in self.labels] if self.is_train_phase or self.is_test_phase:
feed_list += [label.name for label in self.labels]
else:
feed_list = super(ReadingComprehensionTask, self).feed_list
return feed_list return feed_list
@property @property
def fetch_list(self): def fetch_list(self):
if self.is_train_phase or self.is_test_phase: if self.is_train_phase or self.is_test_phase:
return [ return [
self.loss.name, self.outputs[-1].name, self.unique_ids.name, self.loss.name, self.outputs[-1].name, self.unique_id.name,
self.outputs[0].name, self.outputs[1].name self.outputs[0].name, self.outputs[1].name
] ]
elif self.is_predict_phase: elif self.is_predict_phase:
return [ return [
self.unique_ids.name, self.unique_id.name,
] + [output.name for output in self.outputs] ] + [output.name for output in self.outputs]
def _calculate_metrics(self, run_states): def _calculate_metrics(self, run_states):
...@@ -503,11 +512,17 @@ class ReadingComprehensionTask(BaseTask): ...@@ -503,11 +512,17 @@ class ReadingComprehensionTask(BaseTask):
run_time_used = time.time() - run_states[0].run_time_begin run_time_used = time.time() - run_states[0].run_time_begin
run_speed = run_step / run_time_used run_speed = run_step / run_time_used
avg_loss = np.sum(total_cost) / np.sum(total_num_seqs) avg_loss = np.sum(total_cost) / np.sum(total_num_seqs)
scores = OrderedDict() scores = collections.OrderedDict()
# If no metric has been implemented, the loss will be used for evaluation. # If no metric has been implemented, the loss will be used for evaluation.
if self.is_test_phase: if self.is_test_phase:
all_examples = self.data_reader.all_examples[self.phase] if self._compatible_mode:
all_features = self.data_reader.all_features[self.phase] all_examples = self.data_reader.all_examples[self.phase]
all_features = self.data_reader.all_features[self.phase]
dataset = self.data_reader.dataset
else:
all_examples = self.dataset.get_examples(self.phase)
all_features = self.dataset.get_features(self.phase)
dataset = self.dataset
all_predictions, all_nbest_json, scores_diff_json = get_predictions( all_predictions, all_nbest_json, scores_diff_json = get_predictions(
all_examples=all_examples, all_examples=all_examples,
all_features=all_features, all_features=all_features,
...@@ -519,28 +534,23 @@ class ReadingComprehensionTask(BaseTask): ...@@ -519,28 +534,23 @@ class ReadingComprehensionTask(BaseTask):
null_score_diff_threshold=self.null_score_diff_threshold, null_score_diff_threshold=self.null_score_diff_threshold,
is_english=self.is_english) is_english=self.is_english)
if self.phase == 'val' or self.phase == 'dev': if self.phase == 'val' or self.phase == 'dev':
with io.open( dataset_path = dataset.dev_path
self.data_reader.dataset.dev_path, 'r',
encoding="utf8") as dataset_file:
dataset_json = json.load(dataset_file)
dataset = dataset_json['data']
elif self.phase == 'test': elif self.phase == 'test':
with io.open( dataset_path = dataset.test_path
self.data_reader.dataset.test_path, 'r',
encoding="utf8") as dataset_file:
dataset_json = json.load(dataset_file)
dataset = dataset_json['data']
else: else:
raise Exception("Error phase: %s when runing _calculate_metrics" raise Exception("Error phase: %s when runing _calculate_metrics"
% self.phase) % self.phase)
with io.open(dataset_path, 'r', encoding="utf8") as dataset_file:
dataset_json = json.load(dataset_file)
data = dataset_json['data']
if self.sub_task == "squad": if self.sub_task == "squad":
scores = squad1_evaluate.evaluate(dataset, all_predictions) scores = squad1_evaluate.evaluate(data, all_predictions)
elif self.sub_task == "squad2.0": elif self.sub_task == "squad2.0":
scores = squad2_evaluate.evaluate(dataset, all_predictions, scores = squad2_evaluate.evaluate(data, all_predictions,
scores_diff_json) scores_diff_json)
elif self.sub_task in ["cmrc2018", "drcd"]: elif self.sub_task in ["cmrc2018", "drcd"]:
scores = cmrc2018_evaluate.get_eval(dataset, all_predictions) scores = cmrc2018_evaluate.get_eval(data, all_predictions)
return scores, avg_loss, run_speed return scores, avg_loss, run_speed
def _postprocessing(self, run_states): def _postprocessing(self, run_states):
...@@ -558,8 +568,12 @@ class ReadingComprehensionTask(BaseTask): ...@@ -558,8 +568,12 @@ class ReadingComprehensionTask(BaseTask):
unique_id=unique_id, unique_id=unique_id,
start_logits=start_logits, start_logits=start_logits,
end_logits=end_logits)) end_logits=end_logits))
all_examples = self.data_reader.all_examples[self.phase] if self._compatible_mode:
all_features = self.data_reader.all_features[self.phase] all_examples = self.data_reader.all_examples[self.phase]
all_features = self.data_reader.all_features[self.phase]
else:
all_examples = self.dataset.get_examples(self.phase)
all_features = self.dataset.get_features(self.phase)
all_predictions, all_nbest_json, scores_diff_json = get_predictions( all_predictions, all_nbest_json, scores_diff_json = get_predictions(
all_examples=all_examples, all_examples=all_examples,
all_features=all_features, all_features=all_features,
......
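For orientation, the shape of what the evaluator above receives: the `data` list from a SQuAD-style json file and a {qas_id: predicted_text} mapping. A toy example; the evaluator call is commented out because the file path and the exact scorer depend on the sub_task:

data = [{
    "title": "toy",
    "paragraphs": [{
        "context": "The capital of France is Paris.",
        "qas": [{"id": "q1",
                 "question": "What is the capital of France?",
                 "answers": [{"text": "Paris", "answer_start": 25}]}]
    }]
}]
all_predictions = {"q1": "Paris"}
# scores = squad1_evaluate.evaluate(data, all_predictions)
# -> a dict with 'exact_match' and 'f1'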
...@@ -29,8 +29,9 @@ from .base_task import BaseTask ...@@ -29,8 +29,9 @@ from .base_task import BaseTask
class RegressionTask(BaseTask): class RegressionTask(BaseTask):
def __init__(self, def __init__(self,
feature, feature,
feed_list, dataset=None,
data_reader, feed_list=None,
data_reader=None,
startup_program=None, startup_program=None,
config=None, config=None,
hidden_units=None, hidden_units=None,
...@@ -40,6 +41,7 @@ class RegressionTask(BaseTask): ...@@ -40,6 +41,7 @@ class RegressionTask(BaseTask):
main_program = feature.block.program main_program = feature.block.program
super(RegressionTask, self).__init__( super(RegressionTask, self).__init__(
dataset=dataset,
data_reader=data_reader, data_reader=data_reader,
main_program=main_program, main_program=main_program,
feed_list=feed_list, feed_list=feed_list,
......
...@@ -21,10 +21,9 @@ import time ...@@ -21,10 +21,9 @@ import time
from collections import OrderedDict from collections import OrderedDict
import numpy as np import numpy as np
import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddlehub.finetune.evaluate import chunk_eval, calculate_f1 from paddlehub.finetune.evaluate import chunk_eval, calculate_f1
from paddlehub.common.utils import version_compare from paddlehub.common.logger import logger
from .base_task import BaseTask from .base_task import BaseTask
...@@ -33,8 +32,9 @@ class SequenceLabelTask(BaseTask): ...@@ -33,8 +32,9 @@ class SequenceLabelTask(BaseTask):
feature, feature,
max_seq_len, max_seq_len,
num_classes, num_classes,
feed_list, dataset=None,
data_reader, feed_list=None,
data_reader=None,
startup_program=None, startup_program=None,
config=None, config=None,
metrics_choices="default", metrics_choices="default",
...@@ -46,6 +46,7 @@ class SequenceLabelTask(BaseTask): ...@@ -46,6 +46,7 @@ class SequenceLabelTask(BaseTask):
main_program = feature.block.program main_program = feature.block.program
super(SequenceLabelTask, self).__init__( super(SequenceLabelTask, self).__init__(
dataset=dataset,
data_reader=data_reader, data_reader=data_reader,
main_program=main_program, main_program=main_program,
feed_list=feed_list, feed_list=feed_list,
...@@ -199,11 +200,14 @@ class SequenceLabelTask(BaseTask): ...@@ -199,11 +200,14 @@ class SequenceLabelTask(BaseTask):
@property @property
def feed_list(self): def feed_list(self):
feed_list = [varname for varname in self._base_feed_list] if self._compatible_mode:
if self.is_train_phase or self.is_test_phase: feed_list = [varname for varname in self._base_feed_list]
feed_list += [self.labels[0].name, self.seq_len.name] if self.is_train_phase or self.is_test_phase:
feed_list += [self.labels[0].name, self.seq_len.name]
else:
feed_list += [self.seq_len.name]
else: else:
feed_list += [self.seq_len.name] feed_list = super(SequenceLabelTask, self).feed_list
return feed_list return feed_list
@property @property
...@@ -215,10 +219,22 @@ class SequenceLabelTask(BaseTask): ...@@ -215,10 +219,22 @@ class SequenceLabelTask(BaseTask):
return [output.name for output in self.outputs] return [output.name for output in self.outputs]
def _postprocessing(self, run_states): def _postprocessing(self, run_states):
id2label = { if self._compatible_mode:
val: key id2label = {
for key, val in self._base_data_reader.label_map.items() val: key
} for key, val in self._base_data_reader.label_map.items()
}
else:
if self._label_list:
id2label = {}
for index, label in enumerate(self._label_list):
id2label[index] = label
else:
logger.warning(
"Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
)
return run_states
results = [] results = []
for batch_states in run_states: for batch_states in run_states:
batch_results = batch_states.run_results batch_results = batch_states.run_results
......
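The id-to-label mapping built in the non-compatible branch above, in isolation (toy label names standing in for dataset.get_labels()):

label_list = ["B-PER", "I-PER", "O"]
id2label = {index: label for index, label in enumerate(label_list)}
print(id2label)   # {0: 'B-PER', 1: 'I-PER', 2: 'O'}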
...@@ -688,11 +688,13 @@ class Features(object): ...@@ -688,11 +688,13 @@ class Features(object):
s = "" s = ""
s += "unique_id: %s " % self.unique_id s += "unique_id: %s " % self.unique_id
s += "example_index: %s " % self.example_index s += "example_index: %s " % self.example_index
s += "doc_span_index: %s" % self.doc_span_index
s += "tokens: %s" % self.tokens
s += "token_to_orig_map %s" % self.token_to_orig_map
s += "token_is_max_context %s" % self.token_is_max_context
s += "start_position: %s " % self.start_position s += "start_position: %s " % self.start_position
s += "end_position: %s " % self.end_position s += "end_position: %s " % self.end_position
s += "is_impossible: %s " % self.is_impossible s += "is_impossible: %s " % self.is_impossible
# s += "tokens: %s" % self.tokens
# s += "token_to_orig_map %s" % self.token_to_orig_map
return s return s
......
...@@ -140,29 +140,6 @@ class FullTokenizer(object): ...@@ -140,29 +140,6 @@ class FullTokenizer(object):
return convert_by_vocab(self.inv_vocab, ids) return convert_by_vocab(self.inv_vocab, ids)
class CharTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in text.lower().split(" "):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class WSSPTokenizer(object): class WSSPTokenizer(object):
def __init__(self, vocab_file, sp_model_dir, word_dict, ws=True, def __init__(self, vocab_file, sp_model_dir, word_dict, ws=True,
lower=True): lower=True):
......
from .bert_tokenizer import BertTokenizer
from .bert_tokenizer import ErnieTinyTokenizer
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file is modified from https://github.com/huggingface/transformers"""
import collections
import os
import unicodedata
import pickle
from typing import Dict, List, Optional, Union, Tuple
import sentencepiece as spm
from .tokenizer_util import load_vocab, is_whitespace, is_control, is_punctuation, whitespace_tokenize, is_chinese_char
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def __init__(self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True):
""" Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
never_split: (`optional`) list of str
List of tokens not to split.
tokenize_chinese_chars: (`optional`) boolean (default True)
Whether to tokenize Chinese characters.
This should likely be deactivated for Japanese:
see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
"""
if never_split is None:
never_split = []
self.do_lower_case = do_lower_case
self.never_split = never_split
self.tokenize_chinese_chars = tokenize_chinese_chars
def tokenize(self, text, never_split=None):
""" Basic Tokenization of a piece of text.
Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer.
Args:
**never_split**: (`optional`) list of str
List of tokens not to split.
"""
never_split = self.never_split + (never_split
if never_split is not None else [])
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
if self.tokenize_chinese_chars:
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case and token not in never_split:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token, never_split))
output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
if never_split is not None and text in never_split:
return [text]
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
if is_chinese_char(char):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or is_control(char):
continue
if is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)
def encode(self):
raise NotImplementedError(
"This tokenizer can only do tokenize(...), "
"the ability to convert tokens to ids has not been implemented")
def decode(self):
raise NotImplementedError(
"This tokenizer can only do tokenize(...), "
"the ability to convert ids to tokens has not been implemented")
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word
def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A list of wordpiece tokens.
"""
output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens
def encode(self):
raise NotImplementedError(
"This tokenizer can only do tokenize(...), "
"the ability to convert tokens to ids has not been implemented")
def decode(self):
raise NotImplementedError(
"This tokenizer can only do tokenize(...), "
"the ability to convert ids to tokens has not been implemented")
class BertTokenizer(object):
"""
Constructs a BERT tokenizer. Based on WordPiece.
Args:
vocab_file (:obj:`string`):
File containing the vocabulary.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase the input when tokenizing.
do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to do basic tokenization before WordPiece.
never_split (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
List of tokens which will never be split during tokenization. Only has an effect when
:obj:`do_basic_tokenize=True`
unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to tokenize Chinese characters.
This should likely be deactivated for Japanese:
see: https://github.com/huggingface/transformers/issues/328
"""
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
):
self.unk_token = unk_token
self.sep_token = sep_token
self.pad_token = pad_token
self.cls_token = cls_token
self.mask_token = mask_token
self.do_lower_case = do_lower_case
self.all_special_tokens = [
unk_token, sep_token, pad_token, cls_token, mask_token
]
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'.".format(vocab_file))
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars)
self.wordpiece_tokenizer = WordpieceTokenizer(
vocab=self.vocab, unk_token=self.unk_token)
self.unk_token_id = self.convert_tokens_to_ids(self.unk_token)
self.sep_token_id = self.convert_tokens_to_ids(self.sep_token)
self.pad_token_id = self.convert_tokens_to_ids(self.pad_token)
self.pad_token_type_id = 0
self.cls_token_id = self.convert_tokens_to_ids(self.cls_token)
self.mask_token_id = self.convert_tokens_to_ids(self.mask_token)
self.all_special_ids = self.convert_tokens_to_ids(
self.all_special_tokens)
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
return dict(self.vocab)
def _convert_token_to_id(self, token):
""" Converts a token (str) in an id using the vocab. """
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.ids_to_tokens.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
""" Converts a sequence of tokens (string) in a single string. """
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def convert_tokens_to_ids(self, tokens):
""" Converts a token string (or a sequence of tokens) in a single integer id
(or a sequence of ids), using the vocabulary.
"""
if tokens is None:
return None
if isinstance(tokens, str):
return self._convert_token_to_id(tokens)
ids = []
for token in tokens:
ids.append(self._convert_token_to_id(token))
return ids
def convert_ids_to_tokens(self,
ids: Union[int, List[int]],
skip_special_tokens: bool = False
) -> Union[str, List[str]]:
""" Converts a single index or a sequence of indices (integers) into a token
(resp. a sequence of tokens), using the vocabulary and added tokens.
Args:
skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
"""
if isinstance(ids, int):
return self._convert_id_to_token(ids)
tokens = []
for index in ids:
index = int(index)
if skip_special_tokens and index in self.all_special_ids:
continue
tokens.append(self._convert_id_to_token(index))
return tokens
def tokenize(self, text):
""" Converts a string in a sequence of tokens (string), using the tokenizer.
Split in words for word-based vocabulary or sub-words for sub-word-based
vocabularies (BPE/SentencePieces/WordPieces).
Take care of added tokens.
Args:
text (:obj:`string`): The sequence to be encoded.
"""
split_tokens = []
if self.do_basic_tokenize:
for token in self.basic_tokenizer.tokenize(
text, never_split=self.all_special_tokens):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
else:
split_tokens = self.wordpiece_tokenizer.tokenize(text)
return split_tokens
def build_inputs_with_special_tokens(self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens.
A BERT sequence has the following format:
- single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs` with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
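# Worked example as comments (illustrative, not part of the original change): assuming
# cls_token_id=101 and sep_token_id=102, token_ids_0=[7, 8] and token_ids_1=[9] are
# combined into [101, 7, 8, 102, 9, 102], i.e. "[CLS] A [SEP] B [SEP]".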
def num_special_tokens_to_add(self, pair=False):
"""
Returns the number of added tokens when encoding a sequence with special tokens.
Note:
This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this
inside your training loop.
Args:
pair: if set to True, returns the number of tokens added for a sequence pair; if set to False, returns the
number of tokens added for a single sequence.
Returns:
Number of tokens added to sequences
"""
token_ids_0 = []
token_ids_1 = []
return len(
self.build_inputs_with_special_tokens(
token_ids_0, token_ids_1 if pair else None))
def get_special_tokens_mask(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None,
already_has_special_tokens: bool = False) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``encode`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(
map(
lambda x: 1 if x in [self.sep_token_id, self.cls_token_id]
else 0, token_ids_0))
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + (
[0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_segment_ids_from_sequences(
self, token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None) -> List[int]:
"""
Creates segment ids (token type IDs) from the two sequences passed, to be used in a sequence-pair classification task.
A BERT sequence pair mask has the following format:
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
If token_ids_1 is None, only returns the first portion of the segment ids (0s).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs` according to the given sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def clean_up_tokenization(self, out_string: str) -> str:
""" Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms.
"""
out_string = (out_string.replace(" .", ".").replace(" ?", "?").replace(
" !", "!").replace(" ,", ",").replace(" ' ", "'").replace(
" n't",
"n't").replace(" 'm", "'m").replace(" 's", "'s").replace(
" 've", "'ve").replace(" 're", "'re"))
return out_string
def truncate_sequences(
self,
ids: List[int],
pair_ids: Optional[List[int]] = None,
num_tokens_to_remove: int = 0,
truncation_strategy: str = "longest_first",
stride: int = 0,
) -> Tuple[List[int], List[int], List[int]]:
""" Truncates a sequence pair in place to the maximum length.
Args:
ids: list of tokenized input ids. Can be obtained from a string by chaining the
`tokenize` and `convert_tokens_to_ids` methods.
pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the
`tokenize` and `convert_tokens_to_ids` methods.
num_tokens_to_remove (:obj:`int`, `optional`, defaults to ``0``):
number of tokens to remove using the truncation strategy
truncation_strategy: string selected in the following options:
- 'longest_first' (default): iteratively reduce the inputs until they fit under max_seq_len,
removing a token from the longest sequence at each step (when there is a pair of input sequences).
Overflowing tokens only contain overflow from the first sequence.
- 'only_first': Only truncate the first sequence. Raises an error if the first sequence is shorter than or equal in length to num_tokens_to_remove.
- 'only_second': Only truncate the second sequence
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_seq_len)
stride (:obj:`int`, `optional`, defaults to ``0``):
If set to a number along with max_seq_len, the overflowing tokens returned will contain some tokens
from the main sequence returned. The value of this argument defines the number of additional tokens.
"""
if num_tokens_to_remove <= 0:
return ids, pair_ids, []
if truncation_strategy == "longest_first":
overflowing_tokens = []
for _ in range(num_tokens_to_remove):
if pair_ids is None or len(ids) > len(pair_ids):
overflowing_tokens = [ids[-1]] + overflowing_tokens
ids = ids[:-1]
else:
pair_ids = pair_ids[:-1]
window_len = min(len(ids), stride)
if window_len > 0:
overflowing_tokens = ids[-window_len:] + overflowing_tokens
elif truncation_strategy == "only_first":
assert len(ids) > num_tokens_to_remove
window_len = min(len(ids), stride + num_tokens_to_remove)
overflowing_tokens = ids[-window_len:]
ids = ids[:-num_tokens_to_remove]
elif truncation_strategy == "only_second":
assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove
window_len = min(len(pair_ids), stride + num_tokens_to_remove)
overflowing_tokens = pair_ids[-window_len:]
pair_ids = pair_ids[:-num_tokens_to_remove]
elif truncation_strategy == "do_not_truncate":
raise ValueError(
"Input sequence are too long for max_seq_len. Please select a truncation strategy."
)
else:
raise ValueError(
"Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']"
)
return (ids, pair_ids, overflowing_tokens)
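# Worked example as comments (illustrative, not part of the original change): with
# truncation_strategy="longest_first", stride=0, ids=[1, 2, 3, 4], pair_ids=[5, 6] and
# num_tokens_to_remove=2, one id is dropped from the longer list at each step, so the
# call returns ids=[1, 2], pair_ids=[5, 6] and overflowing_tokens=[3, 4].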
def encode(self,
text: Union[str, List[str], List[int]],
text_pair: Optional[Union[str, List[str], List[int]]] = None,
max_seq_len: Optional[int] = None,
pad_to_max_seq_len: bool = True,
truncation_strategy: str = "longest_first",
return_position_ids: bool = True,
return_segment_ids: bool = True,
return_input_mask: bool = True,
return_length: bool = True,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False):
"""
Returns a dictionary containing the encoded sequence or sequence pair and additional information:
the mask for sequence classification and the overflowing elements if a ``max_seq_len`` is specified.
Args:
text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method)
text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
string using the `tokenize` method) or a list of integers (tokenized string ids using the
`convert_tokens_to_ids` method)
max_seq_len (:obj:`int`, `optional`, defaults to :obj:`None`):
If set to a number, will limit the total sequence returned so that it has a maximum length.
If there are overflowing tokens, those will be added to the returned dictionary
pad_to_max_seq_len (:obj:`bool`, `optional`, defaults to :obj:`True`):
If set to True and max_seq_len is specified, the returned sequence will be right-padded with the pad
token up to max_seq_len. If max_seq_len is not specified, no padding is applied.
truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`):
String selected in the following options:
- 'longest_first' (default): iteratively reduce the inputs until they fit under max_seq_len,
removing a token from the longest sequence at each step (when there is a pair of input sequences)
- 'only_first': Only truncate the first sequence
- 'only_second': Only truncate the second sequence
- 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_seq_len)
return_position_ids (:obj:`bool`, `optional`, defaults to :obj:`True`):
Set to True to return tokens position ids (default True).
return_segment_ids (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to return token type IDs.
return_input_mask (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to return the attention mask.
return_length (:obj:`bool`, `optional`, defaults to :obj:`True`):
If set to True, the resulting dictionary will include the length of the encoded inputs.
return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True to return overflowing token information (default False).
return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True to return special tokens mask information (default False).
Return:
A Dictionary of shape::
{
input_ids: list[int],
position_ids: list[int] if return_position_ids is True (default)
segment_ids: list[int] if return_segment_ids is True (default)
input_mask: list[int] if return_input_mask is True (default)
seq_len: int if return_length is True (default)
overflowing_tokens: list[int] if a ``max_seq_len`` is specified and return_overflowing_tokens is True
num_truncated_tokens: int if a ``max_seq_len`` is specified and return_overflowing_tokens is True
special_tokens_mask: list[int] if return_special_tokens_mask is True
}
With the fields:
- ``input_ids``: list of token ids to be fed to a model
- ``position_ids``: list of token position ids to be fed to a model
- ``segment_ids``: list of token type ids to be fed to a model
- ``input_mask``: list of indices specifying which tokens should be attended to by the model
- ``seq_len``: the input_ids length
- ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
- ``num_truncated_tokens``: number of tokens truncated when a ``max_seq_len`` is specified
- ``special_tokens_mask``: if return_special_tokens_mask is True, this is a list of [0, 1] values, with 1 specifying special added
tokens and 0 specifying sequence tokens.
"""
def get_input_ids(text):
if isinstance(text, str):
tokens = self.tokenize(text)
return self.convert_tokens_to_ids(tokens)
elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance(
text[0], str):
return self.convert_tokens_to_ids(text)
elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance(
text[0], int):
return text
else:
raise ValueError(
"Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
)
ids = get_input_ids(text)
pair_ids = get_input_ids(text_pair) if text_pair is not None else None
pair = bool(pair_ids is not None)
len_ids = len(ids)
len_pair_ids = len(pair_ids) if pair else 0
encoded_inputs = {}
# Truncation: Handle max sequence length
total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(
pair=pair))
if max_seq_len and total_len > max_seq_len:
ids, pair_ids, overflowing_tokens = self.truncate_sequences(
ids,
pair_ids=pair_ids,
num_tokens_to_remove=total_len - max_seq_len,
truncation_strategy=truncation_strategy,
)
if return_overflowing_tokens:
encoded_inputs["overflowing_tokens"] = overflowing_tokens
encoded_inputs["num_truncated_tokens"] = total_len - max_seq_len
# Add special tokens
sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
segment_ids = self.create_segment_ids_from_sequences(ids, pair_ids)
# Build output dictionary
encoded_inputs["input_ids"] = sequence
if return_segment_ids:
encoded_inputs["segment_ids"] = segment_ids
if return_special_tokens_mask:
encoded_inputs[
"special_tokens_mask"] = self.get_special_tokens_mask(
ids, pair_ids)
if return_length:
encoded_inputs["seq_len"] = len(encoded_inputs["input_ids"])
# Check lengths
assert max_seq_len is None or len(
encoded_inputs["input_ids"]) <= max_seq_len
# Padding
needs_to_be_padded = pad_to_max_seq_len and \
max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len
if needs_to_be_padded:
difference = max_seq_len - len(encoded_inputs["input_ids"])
if return_input_mask:
encoded_inputs["input_mask"] = [1] * len(
encoded_inputs["input_ids"]) + [0] * difference
if return_segment_ids:
encoded_inputs["segment_ids"] = (
encoded_inputs["segment_ids"] +
[self.pad_token_type_id] * difference)
if return_special_tokens_mask:
encoded_inputs["special_tokens_mask"] = encoded_inputs[
"special_tokens_mask"] + [1] * difference
encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [
self.pad_token_id
] * difference
else:
if return_input_mask:
encoded_inputs["input_mask"] = [1] * len(
encoded_inputs["input_ids"])
if return_position_ids:
encoded_inputs["position_ids"] = list(
range(len(encoded_inputs["input_ids"])))
return encoded_inputs
def decode(self,
token_ids: Union[List[int], Dict],
only_convert_to_tokens: bool = False,
skip_pad_token: bool = False,
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True):
"""
Converts a sequence of ids (integers) to a string if only_convert_to_tokens is False, or to a sequence of tokens (str)
when only_convert_to_tokens is True.
Args:
token_ids: list of tokenized input ids, or dict containing a key called "input_ids"; can be obtained using the `encode` method.
only_convert_to_tokens: if set to True, will only return the sequence of tokens (str). `paddlehub.dataset.base_nlp_dataset` will use this optional argument.
skip_pad_token: if set to True, will skip pad tokens.
skip_special_tokens: if set to True, will skip special tokens.
clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
"""
if isinstance(token_ids, dict):
token_ids = token_ids["input_ids"]
filtered_tokens = self.convert_ids_to_tokens(
token_ids, skip_special_tokens=skip_special_tokens)
tokens = []
for token in filtered_tokens:
if skip_pad_token and token == self.pad_token:
continue
tokens.append(token)
if only_convert_to_tokens:
return tokens
if tokens:
text = self.convert_tokens_to_string(tokens)
else:
text = ""
if clean_up_tokenization_spaces:
clean_text = self.clean_up_tokenization(text)
return clean_text
else:
return text
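# Hedged end-to-end sketch (not part of the original change; the vocab path below is
# hypothetical): encode() turns raw text into padded model inputs and decode() maps the
# ids back to text.
#
#     tokenizer = BertTokenizer(vocab_file="/path/to/vocab.txt")
#     record = tokenizer.encode("hello world", max_seq_len=10)
#     # record contains "input_ids", "position_ids", "segment_ids", "input_mask" and "seq_len"
#     text = tokenizer.decode(record, skip_pad_token=True, skip_special_tokens=True)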
class ErnieTinyTokenizer(BertTokenizer):
def __init__(
self,
vocab_file,
spm_path,
word_dict_path,
do_lower_case=True,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
):
self.unk_token = unk_token
self.sep_token = sep_token
self.pad_token = pad_token
self.cls_token = cls_token
self.mask_token = mask_token
self.do_lower_case = do_lower_case
self.all_special_tokens = [
unk_token, sep_token, pad_token, cls_token, mask_token
]
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'.".format(vocab_file))
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()])
# Here is the difference from BertTokenizer: a word segmentation dict and a sentencepiece model are loaded.
self.dict = pickle.load(open(word_dict_path, 'rb'))
self.sp_model = spm.SentencePieceProcessor()
self.window_size = 5
self.sp_model.Load(spm_path)
self.unk_token_id = self.convert_tokens_to_ids(self.unk_token)
self.sep_token_id = self.convert_tokens_to_ids(self.sep_token)
self.pad_token_id = self.convert_tokens_to_ids(self.pad_token)
self.pad_token_type_id = 0
self.cls_token_id = self.convert_tokens_to_ids(self.cls_token)
self.mask_token_id = self.convert_tokens_to_ids(self.mask_token)
self.all_special_ids = self.convert_tokens_to_ids(
self.all_special_tokens)
def cut(self, chars):
words = []
idx = 0
while idx < len(chars):
matched = False
for i in range(self.window_size, 0, -1):
cand = chars[idx:idx + i]
if cand in self.dict:
words.append(cand)
matched = True
break
if not matched:
i = 1
words.append(chars[idx])
idx += i
return words
def tokenize(self, text):
text = [s for s in self.cut(text) if s != ' ']
if self.do_lower_case:
text = [s.lower() for s in text]
text = ' '.join(text)
tokens = self.sp_model.EncodeAsPieces(text)
in_vocab_tokens = []
for token in tokens:
if token in self.vocab:
in_vocab_tokens.append(token)
else:
in_vocab_tokens.append(self.unk_token)
return in_vocab_tokens
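# Hedged sketch of the ernie_tiny-specific preprocessing (not part of the original
# change; the file paths below are hypothetical): cut() greedily matches words from the
# word dict within a window of 5 characters, then sentencepiece splits the words into
# sub-word pieces that are mapped to the vocabulary.
#
#     tokenizer = ErnieTinyTokenizer(
#         vocab_file="/path/to/ernie_tiny/vocab.txt",
#         spm_path="/path/to/ernie_tiny/spm.model",
#         word_dict_path="/path/to/ernie_tiny/word_dict.pickle")
#     tokenizer.tokenize("百度的深度学习框架")   # word segmentation + sentencepiece sub-words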
from collections import OrderedDict
import unicodedata
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = {}
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n").split("\t")[0]
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
def is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (
cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
def is_chinese_char(char):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
cp = ord(char)
if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF) #
or (cp >= 0x20000 and cp <= 0x2A6DF) #
or (cp >= 0x2A700 and cp <= 0x2B73F) #
or (cp >= 0x2B740 and cp <= 0x2B81F) #
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
): #
return True
return False
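# Illustrative note as comments (not part of the original change): load_vocab expects one
# token per line (anything after a tab on that line is ignored) and assigns ids by line
# order, e.g. a file containing "[PAD]\n[UNK]\nhello\n" yields
# {"[PAD]": 0, "[UNK]": 1, "hello": 2}.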
...@@ -8,6 +8,7 @@ visualdl >= 2.0.0b ...@@ -8,6 +8,7 @@ visualdl >= 2.0.0b
cma >= 2.7.0 cma >= 2.7.0
sentencepiece sentencepiece
colorlog colorlog
tqdm
# pandas no longer support python2 in version 0.25 and above # pandas no longer support python2 in version 0.25 and above
pandas ; python_version >= "3" pandas ; python_version >= "3"
......