Unverified commit a1cf32cd, authored by kinghuin, committed via GitHub

Tokenizer refactor (#677)

Parent: a253ecaa
...@@ -39,18 +39,17 @@ if __name__ == '__main__': ...@@ -39,18 +39,17 @@ if __name__ == '__main__':
inputs, outputs, program = module.context( inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len) trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use MultiLabelReader to read dataset # Use the appropriate tokenizer to preprocess the data set
dataset = hub.dataset.Toxic() # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
reader = hub.reader.MultiLabelClassifyReader( if module.name == "ernie_tiny":
dataset=dataset, tokenizer = hub.ErnieTinyTokenizer(
vocab_path=module.get_vocab_path(), vocab_file=module.get_vocab_path(),
max_seq_len=args.max_seq_len) spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
# Setup feed list for data feeder else:
feed_list = [ tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
inputs["input_ids"].name, inputs["position_ids"].name, dataset = hub.dataset.Toxic(
inputs["segment_ids"].name, inputs["input_mask"].name tokenizer=tokenizer, max_seq_len=args.max_seq_len)
]
# Construct transfer learning network # Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence. # Use "pooled_output" for classification tasks on an entire sentence.
...@@ -72,9 +71,8 @@ if __name__ == '__main__': ...@@ -72,9 +71,8 @@ if __name__ == '__main__':
# Define a classfication fine-tune task by PaddleHub's API # Define a classfication fine-tune task by PaddleHub's API
multi_label_cls_task = hub.MultiLabelClassifierTask( multi_label_cls_task = hub.MultiLabelClassifierTask(
data_reader=reader, dataset=dataset,
feature=pooled_output, feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels, num_classes=dataset.num_labels,
config=config) config=config)
......
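Because the side-by-side rendering above is hard to follow, here is the new training-side flow from this hunk assembled into one plain snippet. It is a sketch, not part of the commit: the module name, max_seq_len and RunConfig values are placeholders, and only calls that already appear in this diff are used.

import paddlehub as hub

# Placeholder setup; the demo reads these values from argparse.
module = hub.Module(name="ernie_tiny")
inputs, outputs, program = module.context(trainable=True, max_seq_len=128)

# ernie_tiny segments text into subwords, so it needs its sentencepiece model
# and word dict; other transformer modules use the BERT-style tokenizer.
if module.name == "ernie_tiny":
    tokenizer = hub.ErnieTinyTokenizer(
        vocab_file=module.get_vocab_path(),
        spm_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())
else:
    tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())

# The dataset now owns tokenization, replacing MultiLabelClassifyReader
# and the hand-built feed_list.
dataset = hub.dataset.Toxic(tokenizer=tokenizer, max_seq_len=128)

# The task consumes the dataset directly instead of data_reader/feed_list.
multi_label_cls_task = hub.MultiLabelClassifierTask(
    dataset=dataset,
    feature=outputs["pooled_output"],
    num_classes=dataset.num_labels,
    config=hub.RunConfig(num_epoch=1, batch_size=32, use_cuda=False))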
...@@ -45,20 +45,11 @@ if __name__ == '__main__': ...@@ -45,20 +45,11 @@ if __name__ == '__main__':
inputs, outputs, program = module.context( inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len) trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use MultiLabelReader to read dataset # Download dataset and get its label list and label num
# If you just want labels information, you can omit its tokenizer parameter to avoid preprocessing the train set.
dataset = hub.dataset.Toxic() dataset = hub.dataset.Toxic()
reader = hub.reader.MultiLabelClassifyReader( num_classes = dataset.num_labels
dataset=dataset, label_list = dataset.get_labels()
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
# Setup feed list for data feeder
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Construct transfer learning network # Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence. # Use "pooled_output" for classification tasks on an entire sentence.
...@@ -75,20 +66,29 @@ if __name__ == '__main__': ...@@ -75,20 +66,29 @@ if __name__ == '__main__':
# Define a classfication fine-tune task by PaddleHub's API # Define a classfication fine-tune task by PaddleHub's API
multi_label_cls_task = hub.MultiLabelClassifierTask( multi_label_cls_task = hub.MultiLabelClassifierTask(
data_reader=reader, dataset=dataset,
feature=pooled_output, feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels, num_classes=dataset.num_labels,
config=config) config=config)
# Data to be predicted # Data to be predicted
data = [ data = [
[ "Yes you did. And you admitted to doing it. See the Warren Kinsella talk page.",
"Yes you did. And you admitted to doing it. See the Warren Kinsella talk page." "I asked you a question. We both know you have my page on your watch list, so are why are you playing games and making me formally ping you? Makin'Bacon",
],
[
"I asked you a question. We both know you have my page on your watch list, so are why are you playing games and making me formally ping you? Makin'Bacon"
],
] ]
# Use the appropriate tokenizer to preprocess the data
# For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
print(multi_label_cls_task.predict(data=data, return_result=True)) encoded_data = [
tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
for text in data
]
print(
multi_label_cls_task.predict(data=encoded_data, label_list=label_list))
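The prediction side of the same demo, restated as a plain sketch; it reuses the tokenizer, dataset and multi_label_cls_task objects built earlier in this file, with a placeholder max_seq_len.

# predict() no longer takes raw strings plus a reader: encode first, then
# pass the label list so predicted ids can be mapped back to label names.
data = [
    "Yes you did. And you admitted to doing it. See the Warren Kinsella talk page.",
]
encoded_data = [tokenizer.encode(text=text, max_seq_len=128) for text in data]
print(
    multi_label_cls_task.predict(
        data=encoded_data, label_list=dataset.get_labels()))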
...@@ -36,31 +36,28 @@ args = parser.parse_args() ...@@ -36,31 +36,28 @@ args = parser.parse_args()
if __name__ == '__main__': if __name__ == '__main__':
# Load Paddlehub ERNIE pretrained model # Load Paddlehub ERNIE pretrained model
module = hub.Module(name="ernie") module = hub.Module(name="ernie_tiny")
inputs, outputs, program = module.context( inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len) trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use ClassifyReader to read dataset # Use the appropriate tokenizer to preprocess the data set
dataset = hub.dataset.NLPCC_DBQA() # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
reader = hub.reader.ClassifyReader( if module.name == "ernie_tiny":
dataset=dataset, tokenizer = hub.ErnieTinyTokenizer(
vocab_path=module.get_vocab_path(), vocab_file=module.get_vocab_path(),
max_seq_len=args.max_seq_len) spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.NLPCC_DBQA(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
# Construct transfer learning network # Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence. # Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output. # Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"] pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Select fine-tune strategy, setup config and fine-tune # Select fine-tune strategy, setup config and fine-tune
strategy = hub.AdamWeightDecayStrategy( strategy = hub.AdamWeightDecayStrategy(
warmup_proportion=args.warmup_proportion, warmup_proportion=args.warmup_proportion,
...@@ -78,9 +75,8 @@ if __name__ == '__main__': ...@@ -78,9 +75,8 @@ if __name__ == '__main__':
# Define a classfication fine-tune task by PaddleHub's API # Define a classfication fine-tune task by PaddleHub's API
cls_task = hub.TextClassifierTask( cls_task = hub.TextClassifierTask(
data_reader=reader, dataset=dataset,
feature=pooled_output, feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels, num_classes=dataset.num_labels,
config=config) config=config)
......
...@@ -39,30 +39,20 @@ args = parser.parse_args() ...@@ -39,30 +39,20 @@ args = parser.parse_args()
if __name__ == '__main__': if __name__ == '__main__':
# loading Paddlehub ERNIE pretrained model # loading Paddlehub ERNIE pretrained model
module = hub.Module(name="ernie") module = hub.Module(name="ernie_tiny")
inputs, outputs, program = module.context(max_seq_len=args.max_seq_len) inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
# Sentence classification dataset reader # Download dataset and get its label list and label num
# If you just want labels information, you can omit its tokenizer parameter to avoid preprocessing the train set.
dataset = hub.dataset.NLPCC_DBQA() dataset = hub.dataset.NLPCC_DBQA()
reader = hub.reader.ClassifyReader( num_classes = dataset.num_labels
dataset=dataset, label_list = dataset.get_labels()
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
# Construct transfer learning network # Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence. # Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output. # Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"] pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Setup RunConfig for PaddleHub Fine-tune API # Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig( config = hub.RunConfig(
use_data_parallel=False, use_data_parallel=False,
...@@ -73,9 +63,8 @@ if __name__ == '__main__': ...@@ -73,9 +63,8 @@ if __name__ == '__main__':
# Define a classfication fine-tune task by PaddleHub's API # Define a classfication fine-tune task by PaddleHub's API
cls_task = hub.TextClassifierTask( cls_task = hub.TextClassifierTask(
data_reader=reader, dataset=dataset,
feature=pooled_output, feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels, num_classes=dataset.num_labels,
config=config) config=config)
...@@ -83,5 +72,18 @@ if __name__ == '__main__': ...@@ -83,5 +72,18 @@ if __name__ == '__main__':
data = [["北京奥运博物馆的场景效果负责人是谁?", "主要承担奥运文物征集、保管、研究和爱国主义教育基地建设相关工作。"], data = [["北京奥运博物馆的场景效果负责人是谁?", "主要承担奥运文物征集、保管、研究和爱国主义教育基地建设相关工作。"],
["北京奥运博物馆的场景效果负责人是谁", "于海勃,美国加利福尼亚大学教授 场景效果负责人 总设计师"], ["北京奥运博物馆的场景效果负责人是谁", "于海勃,美国加利福尼亚大学教授 场景效果负责人 总设计师"],
["北京奥运博物馆的场景效果负责人是谁?", "洪麦恩,清华大学美术学院教授 内容及主展线负责人 总设计师"]] ["北京奥运博物馆的场景效果负责人是谁?", "洪麦恩,清华大学美术学院教授 内容及主展线负责人 总设计师"]]
# Use the appropriate tokenizer to preprocess the data
print(cls_task.predict(data=data, return_result=True)) # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
encoded_data = [
tokenizer.encode(
text=text, text_pair=text_pair, max_seq_len=args.max_seq_len)
for text, text_pair in data
]
print(cls_task.predict(data=encoded_data, label_list=label_list))
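One detail in the hunk above is worth calling out: at predict time the dataset is created without a tokenizer, which skips preprocessing of the training split and only loads label metadata. A short sketch of the two construction modes (module and args as defined earlier in this demo):

# Labels only: no tokenizer, nothing is encoded.
dataset = hub.dataset.NLPCC_DBQA()
num_classes = dataset.num_labels
label_list = dataset.get_labels()

# Full preprocessing: the same class encodes every split once a tokenizer is given.
dataset = hub.dataset.NLPCC_DBQA(
    tokenizer=hub.BertTokenizer(vocab_file=module.get_vocab_path()),
    max_seq_len=args.max_seq_len)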
...@@ -17,7 +17,6 @@ ...@@ -17,7 +17,6 @@
import argparse import argparse
import ast import ast
import paddle.fluid as fluid
import paddlehub as hub import paddlehub as hub
hub.common.logger.logger.setLevel("INFO") hub.common.logger.logger.setLevel("INFO")
...@@ -42,28 +41,23 @@ if __name__ == '__main__': ...@@ -42,28 +41,23 @@ if __name__ == '__main__':
inputs, outputs, program = module.context( inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len) trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use ReadingComprehensionReader to read dataset # Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
# If you wanna load SQuAD 2.0 dataset, just set version_2_with_negative as True # If you wanna load SQuAD 2.0 dataset, just set version_2_with_negative as True
dataset = hub.dataset.SQUAD(version_2_with_negative=False) dataset = hub.dataset.SQUAD(
version_2_with_negative=False,
tokenizer=tokenizer,
max_seq_len=args.max_seq_len)
# dataset = hub.dataset.SQUAD(version_2_with_negative=True) # dataset = hub.dataset.SQUAD(version_2_with_negative=True)
reader = hub.reader.ReadingComprehensionReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
doc_stride=128,
max_query_length=64)
seq_output = outputs["sequence_output"]
# Setup feed list for data feeder
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Select fine-tune strategy, setup config and fine-tune # Select fine-tune strategy, setup config and fine-tune
strategy = hub.AdamWeightDecayStrategy( strategy = hub.AdamWeightDecayStrategy(
weight_decay=args.weight_decay, weight_decay=args.weight_decay,
...@@ -72,7 +66,7 @@ if __name__ == '__main__': ...@@ -72,7 +66,7 @@ if __name__ == '__main__':
# Setup RunConfig for PaddleHub Fine-tune API # Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig( config = hub.RunConfig(
eval_interval=300, eval_interval=100,
use_data_parallel=args.use_data_parallel, use_data_parallel=args.use_data_parallel,
use_cuda=args.use_gpu, use_cuda=args.use_gpu,
num_epoch=args.num_epoch, num_epoch=args.num_epoch,
...@@ -82,9 +76,8 @@ if __name__ == '__main__': ...@@ -82,9 +76,8 @@ if __name__ == '__main__':
# Define a reading comprehension fine-tune task by PaddleHub's API # Define a reading comprehension fine-tune task by PaddleHub's API
reading_comprehension_task = hub.ReadingComprehensionTask( reading_comprehension_task = hub.ReadingComprehensionTask(
data_reader=reader, dataset=dataset,
feature=seq_output, feature=outputs["sequence_output"],
feed_list=feed_list,
config=config, config=config,
sub_task="squad", sub_task="squad",
) )
......
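As the commented-out line above indicates, switching this demo to SQuAD 2.0 only changes the dataset construction; a sketch reusing the tokenizer and args built earlier in the file:

# SQuAD 2.0 adds unanswerable questions; only the flag changes here.
dataset = hub.dataset.SQUAD(
    version_2_with_negative=True,
    tokenizer=tokenizer,
    max_seq_len=args.max_seq_len)
# Note: the task's sub_task argument is shown above only for the 1.1 setting
# ("squad"); check the ReadingComprehensionTask docs for the 2.0 value.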
...@@ -20,12 +20,6 @@ from __future__ import print_function ...@@ -20,12 +20,6 @@ from __future__ import print_function
import argparse import argparse
import ast import ast
import numpy as np
import os
import time
import paddle
import paddle.fluid as fluid
import paddlehub as hub import paddlehub as hub
# yapf: disable # yapf: disable
...@@ -43,27 +37,11 @@ if __name__ == '__main__': ...@@ -43,27 +37,11 @@ if __name__ == '__main__':
inputs, outputs, program = module.context( inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len) trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use RegressionReader to read dataset
dataset = hub.dataset.GLUE("STS-B")
reader = hub.reader.RegressionReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
# Construct transfer learning network # Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence. # Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output. # Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"] pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Setup RunConfig for PaddleHub Fine-tune API # Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig( config = hub.RunConfig(
use_data_parallel=False, use_data_parallel=False,
...@@ -74,13 +52,22 @@ if __name__ == '__main__': ...@@ -74,13 +52,22 @@ if __name__ == '__main__':
# Define a regression fine-tune task by PaddleHub's API # Define a regression fine-tune task by PaddleHub's API
reg_task = hub.RegressionTask( reg_task = hub.RegressionTask(
data_reader=reader,
feature=pooled_output, feature=pooled_output,
feed_list=feed_list,
config=config, config=config,
) )
# Data to be prdicted # STS-B has provided the predict data, and the dataset has process it. If you want to process customized data,
data = [[d.text_a, d.text_b] for d in dataset.get_predict_examples()[:10]] # see the predict.py in text_classification demo
# Use the appropriate tokenizer to preprocess the data
print(reg_task.predict(data=data, return_result=True)) # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.GLUE(
"STS-B", tokenizer=tokenizer, max_seq_len=args.max_seq_len)
encoded_data = dataset.get_predict_records()[:10]
print(reg_task.predict(data=encoded_data))
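Unlike the classification demos, STS-B ships a predict split, so the demo above pulls pre-encoded records straight from the dataset instead of calling tokenizer.encode itself. A compact restatement (dataset and reg_task as built above):

# Raw examples (text_a/text_b pairs) are still available for inspection ...
examples = dataset.get_predict_examples()[:10]
# ... but predict() now consumes encoded records, which the dataset has
# already produced because it was constructed with a tokenizer.
encoded_data = dataset.get_predict_records()[:10]
print(reg_task.predict(data=encoded_data))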
...@@ -17,7 +17,6 @@ ...@@ -17,7 +17,6 @@
import argparse import argparse
import ast import ast
import paddle.fluid as fluid
import paddlehub as hub import paddlehub as hub
# yapf: disable # yapf: disable
...@@ -41,27 +40,24 @@ if __name__ == '__main__': ...@@ -41,27 +40,24 @@ if __name__ == '__main__':
inputs, outputs, program = module.context( inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len) trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use RegressionReader to read dataset # Use the appropriate tokenizer to preprocess the data set
dataset = hub.dataset.GLUE("STS-B") # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
reader = hub.reader.RegressionReader( if module.name == "ernie_tiny":
dataset=dataset, tokenizer = hub.ErnieTinyTokenizer(
vocab_path=module.get_vocab_path(), vocab_file=module.get_vocab_path(),
max_seq_len=args.max_seq_len) spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.GLUE(
"STS-B", tokenizer=tokenizer, max_seq_len=args.max_seq_len)
# Construct transfer learning network # Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence. # Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output. # Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"] pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Select fine-tune strategy, setup config and fine-tune # Select fine-tune strategy, setup config and fine-tune
strategy = hub.AdamWeightDecayStrategy( strategy = hub.AdamWeightDecayStrategy(
warmup_proportion=args.warmup_proportion, warmup_proportion=args.warmup_proportion,
...@@ -70,7 +66,6 @@ if __name__ == '__main__': ...@@ -70,7 +66,6 @@ if __name__ == '__main__':
# Setup RunConfig for PaddleHub Fine-tune API # Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig( config = hub.RunConfig(
eval_interval=300,
use_data_parallel=args.use_data_parallel, use_data_parallel=args.use_data_parallel,
use_cuda=args.use_gpu, use_cuda=args.use_gpu,
num_epoch=args.num_epoch, num_epoch=args.num_epoch,
...@@ -80,10 +75,7 @@ if __name__ == '__main__': ...@@ -80,10 +75,7 @@ if __name__ == '__main__':
# Define a regression fine-tune task by PaddleHub's API # Define a regression fine-tune task by PaddleHub's API
reg_task = hub.RegressionTask( reg_task = hub.RegressionTask(
data_reader=reader, dataset=dataset, feature=pooled_output, config=config)
feature=pooled_output,
feed_list=feed_list,
config=config)
# Fine-tune and evaluate by PaddleHub's API # Fine-tune and evaluate by PaddleHub's API
# will finish training, evaluation, testing, save model automatically # will finish training, evaluation, testing, save model automatically
......
...@@ -42,30 +42,16 @@ if __name__ == '__main__': ...@@ -42,30 +42,16 @@ if __name__ == '__main__':
module = hub.Module(name="ernie_tiny") module = hub.Module(name="ernie_tiny")
inputs, outputs, program = module.context(max_seq_len=args.max_seq_len) inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
# Sentence labeling dataset reader # Download dataset and get its label list and label num
# If you just want labels information, you can omit its tokenizer parameter to avoid preprocessing the train set.
dataset = hub.dataset.MSRA_NER() dataset = hub.dataset.MSRA_NER()
reader = hub.reader.SequenceLabelReader( num_classes = dataset.num_labels
dataset=dataset, label_list = dataset.get_labels()
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
inv_label_map = {val: key for key, val in reader.label_map.items()}
# Construct transfer learning network # Construct transfer learning network
# Use "sequence_output" for token-level output. # Use "sequence_output" for token-level output.
sequence_output = outputs["sequence_output"] sequence_output = outputs["sequence_output"]
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Setup RunConfig for PaddleHub Fine-tune API # Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig( config = hub.RunConfig(
use_data_parallel=False, use_data_parallel=False,
...@@ -77,33 +63,31 @@ if __name__ == '__main__': ...@@ -77,33 +63,31 @@ if __name__ == '__main__':
# Define a sequence labeling fine-tune task by PaddleHub's API # Define a sequence labeling fine-tune task by PaddleHub's API
# if add crf, the network use crf as decoder # if add crf, the network use crf as decoder
seq_label_task = hub.SequenceLabelTask( seq_label_task = hub.SequenceLabelTask(
data_reader=reader,
feature=sequence_output, feature=sequence_output,
feed_list=feed_list,
max_seq_len=args.max_seq_len, max_seq_len=args.max_seq_len,
num_classes=dataset.num_labels, num_classes=num_classes,
config=config, config=config,
add_crf=False) add_crf=False)
# Data to be predicted # Data to be predicted
# If using python 2, prefix "u" is necessary text_a = [
data = [ "我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。",
[u"我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。"], "为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。",
[u"为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。"], "其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。",
[u"其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。"], "有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。",
[u"有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。"], "不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。",
[u"不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。"],
] ]
# Add 0x02 between characters to match the format of training data, # Add 0x02 between characters to match the format of training data,
# otherwise the length of prediction results will not match the input string # otherwise the length of prediction results will not match the input string
# if the input string contains non-Chinese characters. # if the input string contains non-Chinese characters.
tmp_data = [] formatted_text_a = list(map("\002".join, text_a))
for example in data:
formatted = []
for sentence in example:
formatted.append('\x02'.join(list(sentence)))
tmp_data.append(formatted)
data = tmp_data
print(seq_label_task.predict(data=data, return_result=True)) # Use the appropriate tokenizer to preprocess the data
# For ernie_tiny, it use BertTokenizer too.
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
encoded_data = [
tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
for text in formatted_text_a
]
print(seq_label_task.predict(data=encoded_data, label_list=label_list))
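The 0x02 formatting above works because "\002".join iterates a string character by character; a self-contained one-liner (with hypothetical input strings) makes the effect concrete:

# "\002".join("AB12") == "A\002B\0021\0022": every character is separated by
# 0x02, matching the character-level format the model was trained on.
formatted = list(map("\002".join, ["AB12", "CD"]))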
...@@ -40,26 +40,16 @@ if __name__ == '__main__': ...@@ -40,26 +40,16 @@ if __name__ == '__main__':
inputs, outputs, program = module.context( inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len) trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use SequenceLabelReader to read dataset # Use the appropriate tokenizer to preprocess the data set
dataset = hub.dataset.MSRA_NER() # For ernie_tiny, it use BertTokenizer too.
reader = hub.reader.SequenceLabelReader( tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset=dataset, dataset = hub.dataset.MSRA_NER(
vocab_path=module.get_vocab_path(), tokenizer=tokenizer, max_seq_len=args.max_seq_len)
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
# Construct transfer learning network # Construct transfer learning network
# Use "sequence_output" for token-level output. # Use "sequence_output" for token-level output.
sequence_output = outputs["sequence_output"] sequence_output = outputs["sequence_output"]
# Setup feed list for data feeder
# Must feed all the tensor of module need
feed_list = [
inputs["input_ids"].name, inputs["position_ids"].name,
inputs["segment_ids"].name, inputs["input_mask"].name
]
# Select a fine-tune strategy # Select a fine-tune strategy
strategy = hub.AdamWeightDecayStrategy( strategy = hub.AdamWeightDecayStrategy(
warmup_proportion=args.warmup_proportion, warmup_proportion=args.warmup_proportion,
...@@ -78,9 +68,8 @@ if __name__ == '__main__': ...@@ -78,9 +68,8 @@ if __name__ == '__main__':
# Define a sequence labeling fine-tune task by PaddleHub's API # Define a sequence labeling fine-tune task by PaddleHub's API
# If add crf, the network use crf as decoder # If add crf, the network use crf as decoder
seq_label_task = hub.SequenceLabelTask( seq_label_task = hub.SequenceLabelTask(
data_reader=reader, dataset=dataset,
feature=sequence_output, feature=sequence_output,
feed_list=feed_list,
max_seq_len=args.max_seq_len, max_seq_len=args.max_seq_len,
num_classes=dataset.num_labels, num_classes=dataset.num_labels,
config=config, config=config,
......
...@@ -21,9 +21,9 @@ parser.add_argument("--max_seq_len", type=int, default=512, ...@@ -21,9 +21,9 @@ parser.add_argument("--max_seq_len", type=int, default=512,
# yapf: enable. # yapf: enable.
class TransformerSequenceLabelLayer(fluid.dygraph.Layer): class TransformerSeqLabeling(fluid.dygraph.Layer):
def __init__(self, num_classes, transformer): def __init__(self, num_classes, transformer):
super(TransformerSequenceLabelLayer, self).__init__() super(TransformerSeqLabeling, self).__init__()
self.num_classes = num_classes self.num_classes = num_classes
self.transformer = transformer self.transformer = transformer
self.fc = Linear(input_dim=768, output_dim=num_classes) self.fc = Linear(input_dim=768, output_dim=num_classes)
...@@ -39,11 +39,15 @@ class TransformerSequenceLabelLayer(fluid.dygraph.Layer): ...@@ -39,11 +39,15 @@ class TransformerSequenceLabelLayer(fluid.dygraph.Layer):
def finetune(args): def finetune(args):
ernie = hub.Module(name="ernie", max_seq_len=args.max_seq_len) module = hub.Module(name="ernie", max_seq_len=args.max_seq_len)
# Use the appropriate tokenizer to preprocess the data set
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.MSRA_NER(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
with fluid.dygraph.guard(): with fluid.dygraph.guard():
dataset = hub.dataset.MSRA_NER() ts = TransformerSeqLabeling(
ts = TransformerSequenceLabelLayer( num_classes=dataset.num_labels, transformer=module)
num_classes=dataset.num_labels, transformer=ernie)
adam = AdamOptimizer(learning_rate=1e-5, parameter_list=ts.parameters()) adam = AdamOptimizer(learning_rate=1e-5, parameter_list=ts.parameters())
state_dict_path = os.path.join(args.checkpoint_dir, state_dict_path = os.path.join(args.checkpoint_dir,
'dygraph_state_dict') 'dygraph_state_dict')
...@@ -51,34 +55,32 @@ def finetune(args): ...@@ -51,34 +55,32 @@ def finetune(args):
state_dict, _ = fluid.load_dygraph(state_dict_path) state_dict, _ = fluid.load_dygraph(state_dict_path)
ts.load_dict(state_dict) ts.load_dict(state_dict)
reader = hub.reader.SequenceLabelReader(
dataset=dataset,
vocab_path=ernie.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=ernie.get_spm_path(),
word_dict_path=ernie.get_word_dict_path())
train_reader = reader.data_generator(
batch_size=args.batch_size, phase='train')
loss_sum = total_infer = total_label = total_correct = cnt = 0 loss_sum = total_infer = total_label = total_correct = cnt = 0
# Train for num_epoch epochs
for epoch in range(args.num_epoch): for epoch in range(args.num_epoch):
# Read the training data and train for batch_id, data in enumerate(
for batch_id, data in enumerate(train_reader()): dataset.batch_records_generator(
input_ids = np.array(data[0][0]).astype(np.int64) phase="train",
position_ids = np.array(data[0][1]).astype(np.int64) batch_size=args.batch_size,
segment_ids = np.array(data[0][2]).astype(np.int64) shuffle=True,
input_mask = np.array(data[0][3]).astype(np.float32) pad_to_batch_max_seq_len=False)):
labels = np.array(data[0][4]).astype(np.int64).reshape(-1, 1) batch_size = len(data["input_ids"])
seq_len = np.squeeze( input_ids = np.array(data["input_ids"]).astype(
np.array(data[0][5]).astype(np.int64), axis=1) np.int64).reshape([batch_size, -1, 1])
position_ids = np.array(data["position_ids"]).astype(
np.int64).reshape([batch_size, -1, 1])
segment_ids = np.array(data["segment_ids"]).astype(
np.int64).reshape([batch_size, -1, 1])
input_mask = np.array(data["input_mask"]).astype(
np.float32).reshape([batch_size, -1, 1])
labels = np.array(data["label"]).astype(np.int64).reshape(-1, 1)
seq_len = np.array(data["seq_len"]).astype(np.int64).reshape(
-1, 1)
pred, ret_infers = ts(input_ids, position_ids, segment_ids, pred, ret_infers = ts(input_ids, position_ids, segment_ids,
input_mask) input_mask)
loss = fluid.layers.cross_entropy(pred, to_variable(labels)) loss = fluid.layers.cross_entropy(pred, to_variable(labels))
avg_loss = fluid.layers.mean(loss) avg_loss = fluid.layers.mean(loss)
avg_loss.backward() avg_loss.backward()
# Update the parameters
adam.minimize(avg_loss) adam.minimize(avg_loss)
loss_sum += avg_loss.numpy() * labels.shape[0] loss_sum += avg_loss.numpy() * labels.shape[0]
......
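For anyone porting their own dygraph loop, the generator introduced above yields dict batches keyed by feature name; a minimal consumption sketch using the same keys and reshapes as the hunk (batch_size is a placeholder, dataset as built earlier in this file):

import numpy as np

for data in dataset.batch_records_generator(
        phase="train",
        batch_size=8,
        shuffle=True,
        pad_to_batch_max_seq_len=False):
    batch_size = len(data["input_ids"])
    # Transformer inputs are reshaped to [batch_size, seq_len, 1] before being
    # wrapped into dygraph variables; token labels are flattened to [-1, 1].
    input_ids = np.array(data["input_ids"]).astype(np.int64).reshape(
        [batch_size, -1, 1])
    labels = np.array(data["label"]).astype(np.int64).reshape(-1, 1)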
...@@ -20,11 +20,7 @@ from __future__ import print_function ...@@ -20,11 +20,7 @@ from __future__ import print_function
import argparse import argparse
import ast import ast
import numpy as np
import os
import time
import paddle
import paddle.fluid as fluid
import paddlehub as hub import paddlehub as hub
# yapf: disable # yapf: disable
...@@ -43,32 +39,11 @@ if __name__ == '__main__': ...@@ -43,32 +39,11 @@ if __name__ == '__main__':
inputs, outputs, program = module.context( inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len) trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use accuracy as metrics # Download dataset and get its label list and label num
# Choose dataset: GLUE/XNLI/ChinesesGLUE/NLPCC-DBQA/LCQMC # If you just want labels information, you can omit its tokenizer parameter to avoid preprocessing the train set.
dataset = hub.dataset.ChnSentiCorp() dataset = hub.dataset.ChnSentiCorp()
num_classes = dataset.num_labels
# For ernie_tiny, it use sub-word to tokenize chinese sentence label_list = dataset.get_labels()
# If not ernie tiny, sp_model_path and word_dict_path should be set None
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensor of module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Setup RunConfig for PaddleHub Fine-tune API # Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig( config = hub.RunConfig(
...@@ -80,14 +55,26 @@ if __name__ == '__main__': ...@@ -80,14 +55,26 @@ if __name__ == '__main__':
# Define a classfication fine-tune task by PaddleHub's API # Define a classfication fine-tune task by PaddleHub's API
cls_task = hub.TextClassifierTask( cls_task = hub.TextClassifierTask(
data_reader=reader, feature=outputs["pooled_output"],
feature=pooled_output, num_classes=num_classes,
feed_list=feed_list,
num_classes=dataset.num_labels,
config=config) config=config)
# Data to be prdicted # Data to be prdicted
data = [["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"], ["交通方便;环境很好;服务态度很好 房间较小"], text_a = [
["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"]] "这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般", "交通方便;环境很好;服务态度很好 房间较小",
"19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"
print(cls_task.predict(data=data, return_result=True)) ]
# Use the appropriate tokenizer to preprocess the data
# For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
encoded_data = [
tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
for text in text_a
]
print(cls_task.predict(data=encoded_data, label_list=label_list))
...@@ -20,11 +20,7 @@ from __future__ import print_function ...@@ -20,11 +20,7 @@ from __future__ import print_function
import argparse import argparse
import ast import ast
import numpy as np
import os
import time
import paddle
import paddle.fluid as fluid
import paddlehub as hub import paddlehub as hub
# yapf: disable # yapf: disable
...@@ -44,33 +40,17 @@ if __name__ == '__main__': ...@@ -44,33 +40,17 @@ if __name__ == '__main__':
inputs, outputs, program = module.context( inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len) trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use accuracy as metrics # Download dataset and get its label list and label num
# Choose dataset: GLUE/XNLI/ChinesesGLUE/NLPCC-DBQA/LCQMC # If you just want labels information, you can omit its tokenizer parameter to avoid preprocessing the train set.
dataset = hub.dataset.ChnSentiCorp() dataset = hub.dataset.ChnSentiCorp()
num_classes = dataset.num_labels
# For ernie_tiny, it use sub-word to tokenize chinese sentence label_list = dataset.get_labels()
# If not ernie tiny, sp_model_path and word_dict_path should be set None
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
# Construct transfer learning network # Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence. # Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output. # Use "sequence_output" for token-level output.
token_feature = outputs["sequence_output"] token_feature = outputs["sequence_output"]
# Setup feed list for data feeder
# Must feed all the tensor of module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Setup RunConfig for PaddleHub Fine-tune API # Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig( config = hub.RunConfig(
use_data_parallel=args.use_data_parallel, use_data_parallel=args.use_data_parallel,
...@@ -85,15 +65,27 @@ if __name__ == '__main__': ...@@ -85,15 +65,27 @@ if __name__ == '__main__':
# you must use the outputs["sequence_output"] as the token_feature of TextClassifierTask, # you must use the outputs["sequence_output"] as the token_feature of TextClassifierTask,
# rather than outputs["pooled_output"], and feature is None # rather than outputs["pooled_output"], and feature is None
cls_task = hub.TextClassifierTask( cls_task = hub.TextClassifierTask(
data_reader=reader,
token_feature=token_feature, token_feature=token_feature,
feed_list=feed_list,
network=args.network, network=args.network,
num_classes=dataset.num_labels, num_classes=dataset.num_labels,
config=config) config=config)
# Data to be prdicted # Data to be prdicted
data = [["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"], ["交通方便;环境很好;服务态度很好 房间较小"], text_a = [
["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"]] "这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般", "交通方便;环境很好;服务态度很好 房间较小",
"19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"
print(cls_task.predict(data=data, return_result=True)) ]
# Use the appropriate tokenizer to preprocess the data
# For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
encoded_data = [
tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
for text in text_a
]
print(cls_task.predict(data=encoded_data, label_list=label_list))
...@@ -7,5 +7,5 @@ python -u predict_predefine_net.py \ ...@@ -7,5 +7,5 @@ python -u predict_predefine_net.py \
--checkpoint_dir=$CKPT_DIR \ --checkpoint_dir=$CKPT_DIR \
--max_seq_len=128 \ --max_seq_len=128 \
--use_gpu=True \ --use_gpu=True \
--batch_size=24 \ --batch_size=1 \
--network=bilstm --network=bilstm
...@@ -40,11 +40,23 @@ class TransformerClassifier(fluid.dygraph.Layer): ...@@ -40,11 +40,23 @@ class TransformerClassifier(fluid.dygraph.Layer):
def finetune(args): def finetune(args):
ernie = hub.Module(name="ernie", max_seq_len=args.max_seq_len) module = hub.Module(name="ernie", max_seq_len=args.max_seq_len)
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path(),
)
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.ChnSentiCorp(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
with fluid.dygraph.guard(): with fluid.dygraph.guard():
dataset = hub.dataset.ChnSentiCorp()
tc = TransformerClassifier( tc = TransformerClassifier(
num_classes=dataset.num_labels, transformer=ernie) num_classes=dataset.num_labels, transformer=module)
adam = AdamOptimizer(learning_rate=1e-5, parameter_list=tc.parameters()) adam = AdamOptimizer(learning_rate=1e-5, parameter_list=tc.parameters())
state_dict_path = os.path.join(args.checkpoint_dir, state_dict_path = os.path.join(args.checkpoint_dir,
'dygraph_state_dict') 'dygraph_state_dict')
...@@ -52,32 +64,31 @@ def finetune(args): ...@@ -52,32 +64,31 @@ def finetune(args):
state_dict, _ = fluid.load_dygraph(state_dict_path) state_dict, _ = fluid.load_dygraph(state_dict_path)
tc.load_dict(state_dict) tc.load_dict(state_dict)
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=ernie.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=ernie.get_spm_path(),
word_dict_path=ernie.get_word_dict_path())
train_reader = reader.data_generator(
batch_size=args.batch_size, phase='train')
loss_sum = acc_sum = cnt = 0 loss_sum = acc_sum = cnt = 0
# Train for num_epoch epochs
for epoch in range(args.num_epoch): for epoch in range(args.num_epoch):
# Read the training data and train for batch_id, data in enumerate(
for batch_id, data in enumerate(train_reader()): dataset.batch_records_generator(
input_ids = np.array(data[0][0]).astype(np.int64) phase="train",
position_ids = np.array(data[0][1]).astype(np.int64) batch_size=args.batch_size,
segment_ids = np.array(data[0][2]).astype(np.int64) shuffle=True,
input_mask = np.array(data[0][3]).astype(np.float32) pad_to_batch_max_seq_len=False)):
labels = np.array(data[0][4]).astype(np.int64) batch_size = len(data["input_ids"])
input_ids = np.array(data["input_ids"]).astype(
np.int64).reshape([batch_size, -1, 1])
position_ids = np.array(data["position_ids"]).astype(
np.int64).reshape([batch_size, -1, 1])
segment_ids = np.array(data["segment_ids"]).astype(
np.int64).reshape([batch_size, -1, 1])
input_mask = np.array(data["input_mask"]).astype(
np.float32).reshape([batch_size, -1, 1])
labels = np.array(data["label"]).astype(np.int64).reshape(
[batch_size, 1])
pred = tc(input_ids, position_ids, segment_ids, input_mask) pred = tc(input_ids, position_ids, segment_ids, input_mask)
acc = fluid.layers.accuracy(pred, to_variable(labels)) acc = fluid.layers.accuracy(pred, to_variable(labels))
loss = fluid.layers.cross_entropy(pred, to_variable(labels)) loss = fluid.layers.cross_entropy(pred, to_variable(labels))
avg_loss = fluid.layers.mean(loss) avg_loss = fluid.layers.mean(loss)
avg_loss.backward() avg_loss.backward()
# Update the parameters
adam.minimize(avg_loss) adam.minimize(avg_loss)
loss_sum += avg_loss.numpy() * labels.shape[0] loss_sum += avg_loss.numpy() * labels.shape[0]
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
import argparse import argparse
import ast import ast
import paddlehub as hub import paddlehub as hub
# yapf: disable # yapf: disable
...@@ -39,35 +40,24 @@ if __name__ == '__main__': ...@@ -39,35 +40,24 @@ if __name__ == '__main__':
inputs, outputs, program = module.context( inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len) trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use accuracy as metrics # Use the appropriate tokenizer to preprocess the data set
# Choose dataset: GLUE/XNLI/ChinesesGLUE/NLPCC-DBQA/LCQMC # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
# metric should be acc, f1 or matthews if module.name == "ernie_tiny":
dataset = hub.dataset.ChnSentiCorp() tokenizer = hub.ErnieTinyTokenizer(
metrics_choices = ["acc"] vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
# For ernie_tiny, it use sub-word to tokenize chinese sentence dataset = hub.dataset.ChnSentiCorp(
# If not ernie tiny, sp_model_path and word_dict_path should be set None tokenizer=tokenizer, max_seq_len=args.max_seq_len)
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
# Construct transfer learning network # Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence. # Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output. # Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"] pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensor of module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Select fine-tune strategy, setup config and fine-tune # Select fine-tune strategy, setup config and fine-tune
strategy = hub.AdamWeightDecayStrategy( strategy = hub.AdamWeightDecayStrategy(
warmup_proportion=args.warmup_proportion, warmup_proportion=args.warmup_proportion,
...@@ -85,12 +75,11 @@ if __name__ == '__main__': ...@@ -85,12 +75,11 @@ if __name__ == '__main__':
# Define a classfication fine-tune task by PaddleHub's API # Define a classfication fine-tune task by PaddleHub's API
cls_task = hub.TextClassifierTask( cls_task = hub.TextClassifierTask(
data_reader=reader, dataset=dataset,
feature=pooled_output, feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels, num_classes=dataset.num_labels,
config=config, config=config,
metrics_choices=metrics_choices) metrics_choices=["acc"])
# Fine-tune and evaluate by PaddleHub's API # Fine-tune and evaluate by PaddleHub's API
# will finish training, evaluation, testing, save model automatically # will finish training, evaluation, testing, save model automatically
......
...@@ -40,35 +40,24 @@ if __name__ == '__main__': ...@@ -40,35 +40,24 @@ if __name__ == '__main__':
inputs, outputs, program = module.context( inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len) trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use accuracy as metrics # Use the appropriate tokenizer to preprocess the data set
# Choose dataset: GLUE/XNLI/ChinesesGLUE/NLPCC-DBQA/LCQMC # For ernie_tiny, it will do word segmentation to get subword. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
# metric should be acc, f1 or matthews if module.name == "ernie_tiny":
dataset = hub.dataset.ChnSentiCorp() tokenizer = hub.ErnieTinyTokenizer(
metrics_choices = ["acc"] vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
# For ernie_tiny, it use sub-word to tokenize chinese sentence dataset = hub.dataset.ChnSentiCorp(
# If not ernie tiny, sp_model_path and word_dict_path should be set None tokenizer=tokenizer, max_seq_len=args.max_seq_len)
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
# Construct transfer learning network # Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence. # Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output. # Use "sequence_output" for token-level output.
token_feature = outputs["sequence_output"] token_feature = outputs["sequence_output"]
# Setup feed list for data feeder
# Must feed all the tensor of module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Select fine-tune strategy, setup config and fine-tune # Select fine-tune strategy, setup config and fine-tune
strategy = hub.AdamWeightDecayStrategy( strategy = hub.AdamWeightDecayStrategy(
warmup_proportion=args.warmup_proportion, warmup_proportion=args.warmup_proportion,
...@@ -90,13 +79,12 @@ if __name__ == '__main__': ...@@ -90,13 +79,12 @@ if __name__ == '__main__':
# you must use the outputs["sequence_output"] as the token_feature of TextClassifierTask, # you must use the outputs["sequence_output"] as the token_feature of TextClassifierTask,
# rather than outputs["pooled_output"], and feature is None # rather than outputs["pooled_output"], and feature is None
cls_task = hub.TextClassifierTask( cls_task = hub.TextClassifierTask(
data_reader=reader, dataset=dataset,
token_feature=token_feature, token_feature=token_feature,
feed_list=feed_list,
network=args.network, network=args.network,
num_classes=dataset.num_labels, num_classes=dataset.num_labels,
config=config, config=config,
metrics_choices=metrics_choices) metrics_choices=["acc"])
# Fine-tune and evaluate by PaddleHub's API # Fine-tune and evaluate by PaddleHub's API
# will finish training, evaluation, testing, save model automatically # will finish training, evaluation, testing, save model automatically
......
...@@ -31,6 +31,7 @@ from . import dataset ...@@ -31,6 +31,7 @@ from . import dataset
from . import finetune from . import finetune
from . import reader from . import reader
from . import network from . import network
from . import tokenizer
from .common.dir import USER_HOME from .common.dir import USER_HOME
from .common.dir import HUB_HOME from .common.dir import HUB_HOME
...@@ -70,3 +71,6 @@ from .finetune.strategy import CombinedStrategy ...@@ -70,3 +71,6 @@ from .finetune.strategy import CombinedStrategy
from .autofinetune.evaluator import report_final_result from .autofinetune.evaluator import report_final_result
from .module.nlp_module import NLPPredictionModule, TransformerModule from .module.nlp_module import NLPPredictionModule, TransformerModule
from .tokenizer.bert_tokenizer import BertTokenizer
from .tokenizer.bert_tokenizer import ErnieTinyTokenizer
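With these two exports (plus the `from . import tokenizer` added above), the tokenizers used throughout the demos resolve from the package root; for example (the vocab path is a placeholder):

import paddlehub as hub

tokenizer = hub.BertTokenizer(vocab_file="vocab.txt")
# equivalently:
from paddlehub.tokenizer.bert_tokenizer import ErnieTinyTokenizer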
...@@ -20,11 +20,16 @@ from __future__ import print_function ...@@ -20,11 +20,16 @@ from __future__ import print_function
import os import os
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
class BQ(BaseNLPDataset): class BQ(TextClassificationDataset):
def __init__(self): """
The Bank Question (BQ) corpus, a Chinese corpus for sentence semantic equivalence identification (SSEI),
contains 120,000 question pairs from 1-year online bank custom service logs.
"""
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "bq") dataset_dir = os.path.join(DATA_HOME, "bq")
base_path = self._download_dataset( base_path = self._download_dataset(
dataset_dir, dataset_dir,
...@@ -36,18 +41,16 @@ class BQ(BaseNLPDataset): ...@@ -36,18 +41,16 @@ class BQ(BaseNLPDataset):
test_file="test.txt", test_file="test.txt",
label_file=None, label_file=None,
label_list=["0", "1"], label_list=["0", "1"],
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
if __name__ == "__main__": if __name__ == "__main__":
ds = BQ() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
print("first 10 dev") ds = BQ(tokenizer=BertTokenizer(vocab_file='vocab.txt'), max_seq_len=10)
print("first 10 dev examples")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 train") print("first 10 dev records")
for e in ds.get_train_examples()[:10]: for e in ds.get_dev_records()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print(e)
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
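The rewritten __main__ block distinguishes raw examples from encoded records; a compact sketch of the two accessors inside this file's __main__ (the vocab path is a placeholder, as in the snippet above):

from paddlehub.tokenizer.bert_tokenizer import BertTokenizer

ds = BQ(tokenizer=BertTokenizer(vocab_file="vocab.txt"), max_seq_len=10)
example = ds.get_dev_examples()[0]  # raw InputExample: guid, text_a, text_b, label
record = ds.get_dev_records()[0]    # encoded feature dict produced by the tokenizer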
...@@ -23,16 +23,16 @@ import csv ...@@ -23,16 +23,16 @@ import csv
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
class ChnSentiCorp(BaseNLPDataset): class ChnSentiCorp(TextClassificationDataset):
""" """
ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for
opinion mining) opinion mining)
""" """
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "chnsenticorp") dataset_dir = os.path.join(DATA_HOME, "chnsenticorp")
base_path = self._download_dataset( base_path = self._download_dataset(
dataset_dir, dataset_dir,
...@@ -44,7 +44,8 @@ class ChnSentiCorp(BaseNLPDataset): ...@@ -44,7 +44,8 @@ class ChnSentiCorp(BaseNLPDataset):
test_file="test.tsv", test_file="test.tsv",
label_file=None, label_file=None,
label_list=["0", "1"], label_list=["0", "1"],
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -63,6 +64,13 @@ class ChnSentiCorp(BaseNLPDataset): ...@@ -63,6 +64,13 @@ class ChnSentiCorp(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = ChnSentiCorp() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
for e in ds.get_train_examples()[:10]: tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = ChnSentiCorp(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev examples")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -20,7 +20,7 @@ import os ...@@ -20,7 +20,7 @@ import os
from paddlehub.reader import tokenization from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import MRCDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/cmrc2018.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/cmrc2018.tar.gz"
SPIECE_UNDERLINE = '▁' SPIECE_UNDERLINE = '▁'
...@@ -62,10 +62,14 @@ class CMRC2018Example(object): ...@@ -62,10 +62,14 @@ class CMRC2018Example(object):
return s return s
class CMRC2018(BaseNLPDataset): class CMRC2018(MRCDataset):
"""A single set of features of data.""" """A single set of features of data."""
def __init__(self): def __init__(self,
tokenizer=None,
max_seq_len=None,
max_query_len=64,
doc_stride=128):
dataset_dir = os.path.join(DATA_HOME, "cmrc2018") dataset_dir = os.path.join(DATA_HOME, "cmrc2018")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(CMRC2018, self).__init__( super(CMRC2018, self).__init__(
...@@ -75,6 +79,10 @@ class CMRC2018(BaseNLPDataset): ...@@ -75,6 +79,10 @@ class CMRC2018(BaseNLPDataset):
test_file=None, test_file=None,
label_file=None, label_file=None,
label_list=None, label_list=None,
tokenizer=tokenizer,
max_seq_len=max_seq_len,
max_query_len=max_query_len,
doc_stride=doc_stride,
) )
def _read_file(self, input_file, phase=False): def _read_file(self, input_file, phase=False):
...@@ -201,7 +209,9 @@ class CMRC2018(BaseNLPDataset): ...@@ -201,7 +209,9 @@ class CMRC2018(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
print("begin") print("begin")
ds = CMRC2018() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = CMRC2018(tokenizer=tokenizer, max_seq_len=50)
print("train") print("train")
examples = ds.get_train_examples() examples = ds.get_train_examples()
for index, e in enumerate(examples): for index, e in enumerate(examples):
......
...@@ -121,6 +121,20 @@ class BaseDataset(object): ...@@ -121,6 +121,20 @@ class BaseDataset(object):
def get_predict_examples(self): def get_predict_examples(self):
return self.predict_examples return self.predict_examples
def get_examples(self, phase):
if phase == "train":
return self.get_train_examples()
elif phase == "dev":
return self.get_dev_examples()
elif phase == "test":
return self.get_test_examples()
elif phase == "val":
return self.get_val_examples()
elif phase == "predict":
return self.get_predict_examples()
else:
raise ValueError("Invalid phase: %s" % phase)
def get_labels(self): def get_labels(self):
return self.label_list return self.label_list
......
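A quick usage note for the new helper: it simply dispatches on the phase string, so callers of any dataset built on this base class can write, for example:

dev_examples = dataset.get_examples("dev")
predict_examples = dataset.get_examples("predict")
# Any phase other than train/dev/val/test/predict raises ValueError.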
...@@ -20,7 +20,7 @@ import os ...@@ -20,7 +20,7 @@ import os
from paddlehub.reader import tokenization from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import MRCDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/drcd.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/drcd.tar.gz"
SPIECE_UNDERLINE = '▁' SPIECE_UNDERLINE = '▁'
...@@ -62,10 +62,16 @@ class DRCDExample(object): ...@@ -62,10 +62,16 @@ class DRCDExample(object):
return s return s
class DRCD(BaseNLPDataset): class DRCD(MRCDataset):
"""A single set of features of data.""" """A single set of features of data."""
def __init__(self): def __init__(
self,
tokenizer=None,
max_seq_len=None,
max_query_len=64,
doc_stride=128,
):
dataset_dir = os.path.join(DATA_HOME, "drcd") dataset_dir = os.path.join(DATA_HOME, "drcd")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(DRCD, self).__init__( super(DRCD, self).__init__(
...@@ -75,6 +81,10 @@ class DRCD(BaseNLPDataset): ...@@ -75,6 +81,10 @@ class DRCD(BaseNLPDataset):
test_file="DRCD_test.json", test_file="DRCD_test.json",
label_file=None, label_file=None,
label_list=None, label_list=None,
tokenizer=tokenizer,
max_seq_len=max_seq_len,
max_query_len=max_query_len,
doc_stride=doc_stride,
) )
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
...@@ -176,8 +186,8 @@ class DRCD(BaseNLPDataset): ...@@ -176,8 +186,8 @@ class DRCD(BaseNLPDataset):
cleaned_answer_text = "".join( cleaned_answer_text = "".join(
tokenization.whitespace_tokenize(orig_answer_text)) tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1: if actual_text.find(cleaned_answer_text) == -1:
logger.warning((actual_text, " vs ", logger.warning("Could not find answer: '%s' vs. '%s'" %
cleaned_answer_text, " in ", qa)) (actual_text, cleaned_answer_text))
continue continue
example = DRCDExample( example = DRCDExample(
qas_id=qas_id, qas_id=qas_id,
...@@ -191,7 +201,9 @@ class DRCD(BaseNLPDataset): ...@@ -191,7 +201,9 @@ class DRCD(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = DRCD() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = DRCD(tokenizer=tokenizer, max_seq_len=50)
print("train") print("train")
examples = ds.get_train_examples() examples = ds.get_train_examples()
for index, e in enumerate(examples): for index, e in enumerate(examples):
......
...@@ -36,7 +36,7 @@ class GLUE(BaseNLPDataset): ...@@ -36,7 +36,7 @@ class GLUE(BaseNLPDataset):
for more information for more information
""" """
def __init__(self, sub_dataset='SST-2'): def __init__(self, sub_dataset='SST-2', tokenizer=None, max_seq_len=None):
# sub_dataset : CoLA, MNLI, MRPC, QNLI, QQP, RTE, SST-2, STS-B # sub_dataset : CoLA, MNLI, MRPC, QNLI, QQP, RTE, SST-2, STS-B
if sub_dataset not in [ if sub_dataset not in [
'CoLA', 'MNLI', 'MNLI_m', 'MNLI_mm', 'MRPC', 'QNLI', 'QQP', 'CoLA', 'MNLI', 'MNLI_m', 'MNLI_mm', 'MRPC', 'QNLI', 'QQP',
...@@ -85,7 +85,8 @@ class GLUE(BaseNLPDataset): ...@@ -85,7 +85,8 @@ class GLUE(BaseNLPDataset):
predict_file=predict_file, predict_file=predict_file,
label_file=None, label_file=None,
label_list=label_list, label_list=label_list,
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -165,11 +166,13 @@ class GLUE(BaseNLPDataset): ...@@ -165,11 +166,13 @@ class GLUE(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
for sub_dataset in [ for sub_dataset in [
'CoLA', 'MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'SST-2', 'STS-B' 'CoLA', 'MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'SST-2', 'STS-B'
]: ]:
print(sub_dataset) print(sub_dataset)
ds = GLUE(sub_dataset=sub_dataset) ds = GLUE(sub_dataset=sub_dataset, tokenizer=tokenizer, max_seq_len=10)
for e in ds.get_train_examples()[:2]: for e in ds.get_train_examples()[:2]:
print(e) print(e)
print() print()
...@@ -182,3 +185,6 @@ if __name__ == "__main__": ...@@ -182,3 +185,6 @@ if __name__ == "__main__":
for e in ds.get_predict_examples()[:2]: for e in ds.get_predict_examples()[:2]:
print(e) print(e)
print() print()
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -22,13 +22,13 @@ import os ...@@ -22,13 +22,13 @@ import os
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/iflytek.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/iflytek.tar.gz"
class IFLYTEK(BaseNLPDataset): class IFLYTEK(TextClassificationDataset):
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "iflytek") dataset_dir = os.path.join(DATA_HOME, "iflytek")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(IFLYTEK, self).__init__( super(IFLYTEK, self).__init__(
...@@ -38,7 +38,8 @@ class IFLYTEK(BaseNLPDataset): ...@@ -38,7 +38,8 @@ class IFLYTEK(BaseNLPDataset):
test_file="test.txt", test_file="test.txt",
label_file=None, label_file=None,
label_list=[str(i) for i in range(119)], label_list=[str(i) for i in range(119)],
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -56,7 +57,9 @@ class IFLYTEK(BaseNLPDataset): ...@@ -56,7 +57,9 @@ class IFLYTEK(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = IFLYTEK() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = IFLYTEK(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
...@@ -67,3 +70,6 @@ if __name__ == "__main__": ...@@ -67,3 +70,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]: for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds) print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -23,17 +23,17 @@ import csv ...@@ -23,17 +23,17 @@ import csv
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/inews.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/inews.tar.gz"
class INews(BaseNLPDataset): class INews(TextClassificationDataset):
""" """
INews is a sentiment analysis dataset for Internet News INews is a sentiment analysis dataset for Internet News
""" """
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "inews") dataset_dir = os.path.join(DATA_HOME, "inews")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(INews, self).__init__( super(INews, self).__init__(
...@@ -43,7 +43,8 @@ class INews(BaseNLPDataset): ...@@ -43,7 +43,8 @@ class INews(BaseNLPDataset):
test_file="test.txt", test_file="test.txt",
label_file=None, label_file=None,
label_list=["0", "1", "2"], label_list=["0", "1", "2"],
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -60,7 +61,10 @@ class INews(BaseNLPDataset): ...@@ -60,7 +61,10 @@ class INews(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = INews() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = INews(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
...@@ -71,3 +75,6 @@ if __name__ == "__main__": ...@@ -71,3 +75,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]: for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds) print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -23,13 +23,13 @@ import csv ...@@ -23,13 +23,13 @@ import csv
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/lcqmc.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/lcqmc.tar.gz"
class LCQMC(BaseNLPDataset): class LCQMC(TextClassificationDataset):
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "lcqmc") dataset_dir = os.path.join(DATA_HOME, "lcqmc")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(LCQMC, self).__init__( super(LCQMC, self).__init__(
...@@ -39,7 +39,8 @@ class LCQMC(BaseNLPDataset): ...@@ -39,7 +39,8 @@ class LCQMC(BaseNLPDataset):
test_file="test.tsv", test_file="test.tsv",
label_file=None, label_file=None,
label_list=["0", "1"], label_list=["0", "1"],
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -58,7 +59,10 @@ class LCQMC(BaseNLPDataset): ...@@ -58,7 +59,10 @@ class LCQMC(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = LCQMC() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = LCQMC(tokenizer=tokenizer, max_seq_len=512)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
...@@ -69,3 +73,7 @@ if __name__ == "__main__": ...@@ -69,3 +73,7 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]: for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds) print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -23,12 +23,12 @@ import csv ...@@ -23,12 +23,12 @@ import csv
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import SeqLabelingDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/msra_ner.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/msra_ner.tar.gz"
class MSRA_NER(BaseNLPDataset): class MSRA_NER(SeqLabelingDataset):
""" """
A set of manually annotated Chinese word-segmentation data and A set of manually annotated Chinese word-segmentation data and
specifications for training and testing a Chinese word-segmentation system specifications for training and testing a Chinese word-segmentation system
...@@ -36,7 +36,7 @@ class MSRA_NER(BaseNLPDataset): ...@@ -36,7 +36,7 @@ class MSRA_NER(BaseNLPDataset):
https://www.microsoft.com/en-us/download/details.aspx?id=52531 https://www.microsoft.com/en-us/download/details.aspx?id=52531
""" """
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "msra_ner") dataset_dir = os.path.join(DATA_HOME, "msra_ner")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(MSRA_NER, self).__init__( super(MSRA_NER, self).__init__(
...@@ -48,7 +48,8 @@ class MSRA_NER(BaseNLPDataset): ...@@ -48,7 +48,8 @@ class MSRA_NER(BaseNLPDataset):
label_list=[ label_list=[
"B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O" "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"
], ],
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -67,7 +68,9 @@ class MSRA_NER(BaseNLPDataset): ...@@ -67,7 +68,9 @@ class MSRA_NER(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = MSRA_NER() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = MSRA_NER(tokenizer=tokenizer, max_seq_len=30)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
...@@ -78,3 +81,6 @@ if __name__ == "__main__": ...@@ -78,3 +81,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]: for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds) print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -23,19 +23,19 @@ import csv ...@@ -23,19 +23,19 @@ import csv
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/nlpcc-dbqa.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/nlpcc-dbqa.tar.gz"
class NLPCC_DBQA(BaseNLPDataset): class NLPCC_DBQA(TextClassificationDataset):
""" """
Please refer to Please refer to
http://tcci.ccf.org.cn/conference/2017/dldoc/taskgline05.pdf http://tcci.ccf.org.cn/conference/2017/dldoc/taskgline05.pdf
for more information for more information
""" """
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "nlpcc-dbqa") dataset_dir = os.path.join(DATA_HOME, "nlpcc-dbqa")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(NLPCC_DBQA, self).__init__( super(NLPCC_DBQA, self).__init__(
...@@ -45,7 +45,8 @@ class NLPCC_DBQA(BaseNLPDataset): ...@@ -45,7 +45,8 @@ class NLPCC_DBQA(BaseNLPDataset):
test_file="test.tsv", test_file="test.tsv",
label_file=None, label_file=None,
label_list=["0", "1"], label_list=["0", "1"],
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -64,7 +65,9 @@ class NLPCC_DBQA(BaseNLPDataset): ...@@ -64,7 +65,9 @@ class NLPCC_DBQA(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = NLPCC_DBQA() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = NLPCC_DBQA(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
...@@ -75,3 +78,6 @@ if __name__ == "__main__": ...@@ -75,3 +78,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]: for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds) print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -20,7 +20,7 @@ import os ...@@ -20,7 +20,7 @@ import os
from paddlehub.reader import tokenization from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import MRCDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz"
...@@ -65,10 +65,17 @@ class SquadExample(object): ...@@ -65,10 +65,17 @@ class SquadExample(object):
return s return s
class SQUAD(BaseNLPDataset): class SQUAD(MRCDataset):
"""A single set of features of data.""" """A single set of features of data."""
def __init__(self, version_2_with_negative=False): def __init__(
self,
version_2_with_negative=False,
tokenizer=None,
max_seq_len=None,
max_query_len=64,
doc_stride=128,
):
self.version_2_with_negative = version_2_with_negative self.version_2_with_negative = version_2_with_negative
if not version_2_with_negative: if not version_2_with_negative:
train_file = "train-v1.1.json" train_file = "train-v1.1.json"
...@@ -87,6 +94,10 @@ class SQUAD(BaseNLPDataset): ...@@ -87,6 +94,10 @@ class SQUAD(BaseNLPDataset):
test_file=None, test_file=None,
label_file=None, label_file=None,
label_list=None, label_list=None,
tokenizer=tokenizer,
max_seq_len=max_seq_len,
max_query_len=max_query_len,
doc_stride=doc_stride,
) )
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
...@@ -177,7 +188,10 @@ class SQUAD(BaseNLPDataset): ...@@ -177,7 +188,10 @@ class SQUAD(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = SQUAD(version_2_with_negative=True) from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = SQUAD(
version_2_with_negative=True, tokenizer=tokenizer, max_seq_len=512)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:2]: for e in ds.get_dev_examples()[:2]:
print(e) print(e)
......
...@@ -22,13 +22,13 @@ import os ...@@ -22,13 +22,13 @@ import os
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/thucnews.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/thucnews.tar.gz"
class THUCNEWS(BaseNLPDataset): class THUCNEWS(TextClassificationDataset):
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "thucnews") dataset_dir = os.path.join(DATA_HOME, "thucnews")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(THUCNEWS, self).__init__( super(THUCNEWS, self).__init__(
...@@ -38,7 +38,8 @@ class THUCNEWS(BaseNLPDataset): ...@@ -38,7 +38,8 @@ class THUCNEWS(BaseNLPDataset):
test_file="test.txt", test_file="test.txt",
label_file=None, label_file=None,
label_list=[str(i) for i in range(14)], label_list=[str(i) for i in range(14)],
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -56,7 +57,9 @@ class THUCNEWS(BaseNLPDataset): ...@@ -56,7 +57,9 @@ class THUCNEWS(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = THUCNEWS() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = THUCNEWS(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
...@@ -67,3 +70,6 @@ if __name__ == "__main__": ...@@ -67,3 +70,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]: for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds) print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -20,7 +20,8 @@ from __future__ import print_function ...@@ -20,7 +20,8 @@ from __future__ import print_function
import io import io
import os import os
from paddlehub.dataset import InputExample, BaseDataset from paddlehub.dataset import InputExample
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/tnews.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/tnews.tar.gz"
...@@ -44,12 +45,12 @@ LABEL_NAME = { ...@@ -44,12 +45,12 @@ LABEL_NAME = {
} }
class TNews(BaseDataset): class TNews(TextClassificationDataset):
""" """
TNews is the chinese news classification dataset on Jinri Toutiao App. TNews is the chinese news classification dataset on Jinri Toutiao App.
""" """
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "tnews") dataset_dir = os.path.join(DATA_HOME, "tnews")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
label_list = [ label_list = [
...@@ -63,7 +64,8 @@ class TNews(BaseDataset): ...@@ -63,7 +64,8 @@ class TNews(BaseDataset):
test_file="toutiao_category_test.txt", test_file="toutiao_category_test.txt",
label_file=None, label_file=None,
label_list=label_list, label_list=label_list,
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def get_label_name(self, id): def get_label_name(self, id):
return LABEL_NAME[id] return LABEL_NAME[id]
...@@ -82,7 +84,9 @@ class TNews(BaseDataset): ...@@ -82,7 +84,9 @@ class TNews(BaseDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = TNews() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = TNews(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
...@@ -93,3 +97,6 @@ if __name__ == "__main__": ...@@ -93,3 +97,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]: for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds) print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -22,18 +22,18 @@ import pandas as pd ...@@ -22,18 +22,18 @@ import pandas as pd
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import MultiLabelDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/toxic.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/toxic.tar.gz"
class Toxic(BaseNLPDataset): class Toxic(MultiLabelDataset):
""" """
The kaggle Toxic dataset: The kaggle Toxic dataset:
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge
""" """
def __init__(self): def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "toxic") dataset_dir = os.path.join(DATA_HOME, "toxic")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL) base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
label_list = [ label_list = [
...@@ -47,7 +47,8 @@ class Toxic(BaseNLPDataset): ...@@ -47,7 +47,8 @@ class Toxic(BaseNLPDataset):
test_file="test.csv", test_file="test.csv",
label_file=None, label_file=None,
label_list=label_list, label_list=label_list,
) tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file.""" """Reads a tab separated value file."""
...@@ -64,7 +65,10 @@ class Toxic(BaseNLPDataset): ...@@ -64,7 +65,10 @@ class Toxic(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = Toxic() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = Toxic(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
...@@ -75,3 +79,6 @@ if __name__ == "__main__": ...@@ -75,3 +79,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]: for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds) print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
...@@ -25,19 +25,19 @@ import csv ...@@ -25,19 +25,19 @@ import csv
from paddlehub.dataset import InputExample from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/XNLI-lan.tar.gz" _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/XNLI-lan.tar.gz"
class XNLI(BaseNLPDataset): class XNLI(TextClassificationDataset):
""" """
Please refer to Please refer to
https://arxiv.org/pdf/1809.05053.pdf https://arxiv.org/pdf/1809.05053.pdf
for more information for more information
""" """
def __init__(self, language='zh'): def __init__(self, language='zh', tokenizer=None, max_seq_len=None):
if language not in [ if language not in [
"ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw", "ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw",
"th", "tr", "ur", "vi", "zh" "th", "tr", "ur", "vi", "zh"
...@@ -55,6 +55,8 @@ class XNLI(BaseNLPDataset): ...@@ -55,6 +55,8 @@ class XNLI(BaseNLPDataset):
test_file="%s_test.tsv" % language, test_file="%s_test.tsv" % language,
label_file=None, label_file=None,
label_list=["neutral", "contradiction", "entailment"], label_list=["neutral", "contradiction", "entailment"],
tokenizer=tokenizer,
max_seq_len=max_seq_len,
) )
def _read_file(self, input_file, phase=None): def _read_file(self, input_file, phase=None):
...@@ -74,7 +76,10 @@ class XNLI(BaseNLPDataset): ...@@ -74,7 +76,10 @@ class XNLI(BaseNLPDataset):
if __name__ == "__main__": if __name__ == "__main__":
ds = XNLI() from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = XNLI(tokenizer=tokenizer, max_seq_len=20)
print("first 10 dev") print("first 10 dev")
for e in ds.get_dev_examples()[:10]: for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label)) print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......
...@@ -167,7 +167,7 @@ class DefaultStrategy(object): ...@@ -167,7 +167,7 @@ class DefaultStrategy(object):
self.optimizer = fluid.optimizer.Adam( self.optimizer = fluid.optimizer.Adam(
learning_rate=self.learning_rate, **kwargs) learning_rate=self.learning_rate, **kwargs)
def execute(self, loss, data_reader, config, dev_count): def execute(self, loss, max_train_steps):
if self.optimizer is not None: if self.optimizer is not None:
self.optimizer.minimize(loss) self.optimizer.minimize(loss)
else: else:
...@@ -456,26 +456,9 @@ class CombinedStrategy(DefaultStrategy): ...@@ -456,26 +456,9 @@ class CombinedStrategy(DefaultStrategy):
"weight_decay"] * scheduled_lr "weight_decay"] * scheduled_lr
fluid.layers.assign(output=param, input=updated_param) fluid.layers.assign(output=param, input=updated_param)
def execute(self, loss, data_reader, config, dev_count): def execute(self, loss, max_train_steps):
# base information # base information
self.main_program = loss.block.program self.main_program = loss.block.program
self.config = config
# self.num_examples = {'train': -1, 'dev': -1, 'test': -1} before data_generator
data_reader.data_generator(
batch_size=config.batch_size, phase='train', shuffle=True)
num_train_examples = data_reader.num_examples['train']
max_train_steps = config.num_epoch * num_train_examples // config.batch_size // dev_count
try:
# nlp_reader
_in_tokens = data_reader.in_tokens
if _in_tokens:
max_train_steps *= data_reader.max_seq_len
except:
# cv_reader without .in_tokens and .max_seq_len
pass
if self.scheduler["discriminative"]["blocks"] > 0 or self.scheduler[ if self.scheduler["discriminative"]["blocks"] > 0 or self.scheduler[
"gradual_unfreeze"]["blocks"] > 0: "gradual_unfreeze"]["blocks"] > 0:
...@@ -494,8 +477,7 @@ class CombinedStrategy(DefaultStrategy): ...@@ -494,8 +477,7 @@ class CombinedStrategy(DefaultStrategy):
self.regularization_handler(loss, scheduled_lr) self.regularization_handler(loss, scheduled_lr)
logger.info(self.__str__()) logger.info(self.__str__())
return scheduled_lr
return scheduled_lr, max_train_steps
def exclude_from_weight_decay(self, name): def exclude_from_weight_decay(self, name):
if name.find("layer_norm") > -1: if name.find("layer_norm") > -1:
......
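The strategy hunk above changes execute() from execute(loss, data_reader, config, dev_count) to execute(loss, max_train_steps): the step budget is now computed by the task itself (see the base_task change later in this diff) instead of being derived from a reader. A worked sketch of that computation, with illustrative numbers only:

# Illustrative numbers; in the task the count comes from len(dataset.get_train_records()).
num_train_examples = 10000
num_epoch, batch_size, dev_count = 3, 32, 1
max_train_steps = num_epoch * num_train_examples // batch_size // dev_count
print(max_train_steps)   # 937
# scheduled_lr = strategy.execute(loss, max_train_steps)   # new two-argument signature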
...@@ -35,6 +35,7 @@ import paddle.fluid as fluid ...@@ -35,6 +35,7 @@ import paddle.fluid as fluid
from visualdl import LogWriter from visualdl import LogWriter
import paddlehub as hub import paddlehub as hub
from paddlehub.reader.nlp_reader import BaseNLPReader
from paddlehub.common.paddle_helper import dtype_map, clone_program from paddlehub.common.paddle_helper import dtype_map, clone_program
from paddlehub.common.utils import mkdir from paddlehub.common.utils import mkdir
from paddlehub.common.dir import tmp_dir from paddlehub.common.dir import tmp_dir
...@@ -84,7 +85,7 @@ class RunEnv(object): ...@@ -84,7 +85,7 @@ class RunEnv(object):
self.start_program = None self.start_program = None
self.main_program_compiled = None self.main_program_compiled = None
self.py_reader = None self.py_reader = None
self.reader = None self.generator = None
self.loss = None self.loss = None
self.labels = None self.labels = None
self.metrics = None self.metrics = None
...@@ -260,8 +261,8 @@ class BaseTask(object): ...@@ -260,8 +261,8 @@ class BaseTask(object):
BaseTask is the base class of all the task. It will complete the building of all the running environment. BaseTask is the base class of all the task. It will complete the building of all the running environment.
Args: Args:
feed_list (list): the inputs name feed_list (list): the inputs name. Deprecated in paddlehub v1.8.
data_reader (object): data reader for the task data_reader (object): data reader for the task. Deprecated in paddlehub v1.8.
main_program (object): the customized main_program, default None main_program (object): the customized main_program, default None
startup_program (object): the customized startup_program, default None startup_program (object): the customized startup_program, default None
config (object): the config for the task, default None config (object): the config for the task, default None
...@@ -269,16 +270,13 @@ class BaseTask(object): ...@@ -269,16 +270,13 @@ class BaseTask(object):
""" """
def __init__(self, def __init__(self,
feed_list, dataset=None,
data_reader, feed_list=None,
data_reader=None,
main_program=None, main_program=None,
startup_program=None, startup_program=None,
config=None, config=None,
metrics_choices="default"): metrics_choices="default"):
# base item
self._base_data_reader = data_reader
self._base_feed_list = feed_list
# metrics item # metrics item
self.best_score = -999 self.best_score = -999
if metrics_choices == "default": if metrics_choices == "default":
...@@ -293,7 +291,6 @@ class BaseTask(object): ...@@ -293,7 +291,6 @@ class BaseTask(object):
if main_program is None: if main_program is None:
self._base_main_program = clone_program( self._base_main_program = clone_program(
fluid.default_main_program(), for_test=False) fluid.default_main_program(), for_test=False)
else: else:
self._base_main_program = clone_program( self._base_main_program = clone_program(
main_program, for_test=False) main_program, for_test=False)
...@@ -344,6 +341,23 @@ class BaseTask(object): ...@@ -344,6 +341,23 @@ class BaseTask(object):
# set default phase # set default phase
self.enter_phase("train") self.enter_phase("train")
self.dataset = dataset
if dataset:
self._label_list = dataset.get_labels()
# Compatible code for usage deprecated in paddlehub v1.8.
self._base_data_reader = data_reader
self._base_feed_list = feed_list
if isinstance(data_reader, BaseNLPReader):
self._compatible_mode = True
logger.warning(
"PaddleHub v1.8 has deprecated the reader and feed_list parameters in the nlp Task. We provided an easier usage, "
"in which you can use your tokenizer to preprocess dataset and run task in a clear flow. "
"New demo see https://github.com/PaddlePaddle/PaddleHub/blob/release/v1.8/demo/text_classification/text_cls.py"
)
else:
self._compatible_mode = False
@contextlib.contextmanager @contextlib.contextmanager
def phase_guard(self, phase): def phase_guard(self, phase):
self.enter_phase(phase) self.enter_phase(phase)
...@@ -420,9 +434,29 @@ class BaseTask(object): ...@@ -420,9 +434,29 @@ class BaseTask(object):
with fluid.program_guard(self.env.main_program, with fluid.program_guard(self.env.main_program,
self._base_startup_program): self._base_startup_program):
with fluid.unique_name.guard(self.env.UNG): with fluid.unique_name.guard(self.env.UNG):
self.scheduled_lr, self.max_train_steps = self.config.strategy.execute( if self._compatible_mode:
self.loss, self._base_data_reader, self.config, # This branch is compatible code for usage deprecated in paddlehub v1.8.
self.device_count) self._base_data_reader.data_generator(
batch_size=self.config.batch_size,
phase='train',
shuffle=True)
num_train_examples = self._base_data_reader.num_examples[
'train']
try:
# nlp_reader
_in_tokens = self._base_data_reader.in_tokens
if _in_tokens:
num_train_examples *= self._base_data_reader.max_seq_len
except:
# cv_reader without .in_tokens and .max_seq_len
pass
else:
num_train_examples = len(
self.dataset.get_train_records())
self.max_train_steps = self.config.num_epoch * num_train_examples // self.config.batch_size // self.device_count
self.scheduled_lr = self.config.strategy.execute(
self.loss, self.max_train_steps)
if self.is_train_phase: if self.is_train_phase:
loss_name = self.env.loss.name loss_name = self.env.loss.name
...@@ -529,17 +563,40 @@ class BaseTask(object): ...@@ -529,17 +563,40 @@ class BaseTask(object):
return self.main_program return self.main_program
@property @property
def reader(self): def generator(self):
if self.is_predict_phase: if self._compatible_mode:
data = self._predict_data if self.is_predict_phase:
data = self._predict_data
else:
data = None
self.env.generator = self._base_data_reader.data_generator(
batch_size=self.config.batch_size,
phase=self.phase,
data=data,
return_list=not self.config.use_pyreader)
else: else:
data = None
self.env.reader = self._base_data_reader.data_generator( def data_generator(records):
batch_size=self.config.batch_size, def wrapper():
phase=self.phase, for record in records:
data=data, values = []
return_list=not self.config.use_pyreader) for feed_name in self.feed_list:
return self.env.reader values.append(record[feed_name])
yield values
return wrapper
if self.is_predict_phase:
records = self._predict_data
else:
if self.is_train_phase:
shuffle = True
else:
shuffle = False
records = self.dataset.get_records(
phase=self.phase, shuffle=shuffle)
self.env.generator = data_generator(records)
return self.env.generator
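The generator property above turns a list of record dicts into a sample generator that yields field values in feed order. A standalone sketch of that wrapping, with made-up feed names and a made-up record:

feed_list = ["input_ids", "segment_ids", "label"]                       # illustrative feed names
records = [{"input_ids": [1, 2, 3], "segment_ids": [0, 0, 0], "label": 1}]

def data_generator(records):
    def wrapper():
        for record in records:
            # yield the record's fields in feed order, as the task's data layer expects
            yield [record[name] for name in feed_list]
    return wrapper

for sample in data_generator(records)():
    print(sample)   # [[1, 2, 3], [0, 0, 0], 1]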
@property @property
def loss(self): def loss(self):
...@@ -580,13 +637,30 @@ class BaseTask(object): ...@@ -580,13 +637,30 @@ class BaseTask(object):
@property @property
def feed_list(self): def feed_list(self):
feed_list = [varname for varname in self._base_feed_list] if self._compatible_mode:
if self.is_train_phase or self.is_test_phase: feed_list = [varname for varname in self._base_feed_list]
feed_list += [label.name for label in self.labels] if self.is_train_phase or self.is_test_phase:
feed_list += [label.name for label in self.labels]
else:
if not self.env.is_inititalized:
self._build_env()
if self._predict_data:
feed_list = list(self._predict_data[0].keys())
else:
feed_list = self.dataset.get_feed_list(self.phase)
feed_list = [
feed_name for feed_name in feed_list
if feed_name in self.main_program.global_block().vars
]
return feed_list return feed_list
@property @property
def feed_var_list(self): def feed_var_list(self):
if not self.env.is_inititalized:
self._build_env()
vars = self.main_program.global_block().vars vars = self.main_program.global_block().vars
return [vars[varname] for varname in self.feed_list] return [vars[varname] for varname in self.feed_list]
...@@ -890,13 +964,20 @@ class BaseTask(object): ...@@ -890,13 +964,20 @@ class BaseTask(object):
self.env.current_epoch += 1 self.env.current_epoch += 1
# Final evaluation # Final evaluation
if self._base_data_reader.get_dev_examples() != []: if self._compatible_mode:
dev_examples = self._base_data_reader.get_dev_examples()
test_examples = self._base_data_reader.get_test_examples()
else:
dev_examples = self.dataset.get_dev_examples()
test_examples = self.dataset.get_test_examples()
if dev_examples != []:
# Warning: DO NOT use self.eval(phase="dev", load_best_model=True) during training. # Warning: DO NOT use self.eval(phase="dev", load_best_model=True) during training.
# It will cause trainer unable to continue training from checkpoint after eval. # It will cause trainer unable to continue training from checkpoint after eval.
# More important, The model should evaluate current performance during training. # More important, The model should evaluate current performance during training.
self.eval(phase="dev") self.eval(phase="dev")
if self._base_data_reader.get_test_examples() != []: if test_examples != []:
self.eval(phase="test", load_best_model=True) self.eval(phase="test", load_best_model=True)
# Save checkpoint after finetune # Save checkpoint after finetune
self.save_checkpoint() self.save_checkpoint()
...@@ -957,17 +1038,41 @@ class BaseTask(object): ...@@ -957,17 +1038,41 @@ class BaseTask(object):
global_run_states = [] global_run_states = []
period_run_states = [] period_run_states = []
for run_step, batch in enumerate(self.reader(), start=1): feed_var_shape = []
feed_var_type = []
for var in self.feed_var_list:
feed_var_shape.append(var.shape)
feed_var_type.append(dtype_map[var.dtype])
if self._compatible_mode:
data_reader = self.generator
else:
data_reader = paddle.batch(
self.generator, batch_size=self.config.batch_size)
for batch in data_reader():
if self._compatible_mode and not self.config.use_pyreader:
# if pyreader is not used, the nlp_reader returns [batch]
batch = batch[0]
step_run_state = RunState(len(self.fetch_list)) step_run_state = RunState(len(self.fetch_list))
step_run_state.run_step = 1 step_run_state.run_step = 1
num_batch_examples = len(batch) num_batch_examples = len(batch)
if not self.config.use_pyreader: # Preprocess the data into the suitable shape and type for the model
# if use pyreader, the nlp_reader return [batch] processed_batch = [[] for i in range(len(self.feed_list))]
batch = batch[0] if self._compatible_mode:
processed_batch = batch
batch = [fluid.core.PaddleTensor(data) for data in batch] else:
fetch_result = self._predictor.run(batch) for sample in batch:
for i, data in enumerate(sample):
processed_batch[i].append(data)
tensor_batch = [[] for i in range(len(self.feed_list))]
for i in range(len(processed_batch)):
processed_batch[i] = np.array(processed_batch[i]).reshape(
feed_var_shape[i]).astype(feed_var_type[i])
tensor_batch[i] = fluid.core.PaddleTensor(processed_batch[i])
fetch_result = self._predictor.run(tensor_batch)
for index, result in enumerate(fetch_result): for index, result in enumerate(fetch_result):
step_run_state.run_results[index] = result.as_ndarray() step_run_state.run_results[index] = result.as_ndarray()
step_run_state.run_examples += num_batch_examples step_run_state.run_examples += num_batch_examples
...@@ -978,18 +1083,23 @@ class BaseTask(object): ...@@ -978,18 +1083,23 @@ class BaseTask(object):
global_run_states += period_run_states global_run_states += period_run_states
return global_run_states return global_run_states
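In the non-compatible branch above, samples are batched with paddle.batch and then regrouped from sample-major tuples into feed-major arrays before being wrapped as PaddleTensor inputs. A small numpy-only sketch of that regrouping step (feed names, shapes and dtypes are illustrative):

import numpy as np

feed_list = ["input_ids", "segment_ids"]        # illustrative feed names
batch = [([1, 2, 3], [0, 0, 0]),                # one tuple of feed values per sample
         ([4, 5, 6], [0, 1, 1])]

processed_batch = [[] for _ in feed_list]       # one column per feed variable
for sample in batch:
    for i, data in enumerate(sample):
        processed_batch[i].append(data)

arrays = [np.array(column, dtype="int64") for column in processed_batch]
print([a.shape for a in arrays])                # [(2, 3), (2, 3)]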
def predict(self, def predict(
data, self,
load_best_model=True, data=None,
return_result=False, label_list=None,
accelerate_mode=True): load_best_model=True,
return_result=False,
accelerate_mode=True,
):
""" """
make prediction for the input data. make prediction for the input data.
Args: Args:
data (list): the data will be predicted. data (list): the data to be predicted. Each element should be a record when the task is initialized without the data_reader param,
or a plaintext string list when the task is initialized with the data_reader param (deprecated in paddlehub v1.8).
label_list (list): the label list, used to postprocess the output.
load_best_model (bool): load the best model or not load_best_model (bool): load the best model or not
return_result (bool): return a readable result or just the raw run result return_result (bool): return a readable result or just the raw run result. Always treated as True when the task is not initialized with the data_reader param.
accelerate_mode (bool): use high-performance predictor or not accelerate_mode (bool): use high-performance predictor or not
Returns: Returns:
...@@ -1005,6 +1115,7 @@ class BaseTask(object): ...@@ -1005,6 +1115,7 @@ class BaseTask(object):
with self.phase_guard(phase="predict"): with self.phase_guard(phase="predict"):
self._predict_data = data self._predict_data = data
self._label_list = label_list
self._predict_start_event() self._predict_start_event()
if load_best_model: if load_best_model:
...@@ -1020,7 +1131,7 @@ class BaseTask(object): ...@@ -1020,7 +1131,7 @@ class BaseTask(object):
self._predict_end_event(run_states) self._predict_end_event(run_states)
self._predict_data = None self._predict_data = None
if return_result: if return_result or not self._compatible_mode:
return self._postprocessing(run_states) return self._postprocessing(run_states)
return run_states return run_states
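A hedged usage sketch of the reworked predict() interface documented above, using only calls that appear in this diff; dataset and cls_task stand in for a tokenizer-backed dataset and an already fine-tuned classification task:

records = dataset.get_dev_records()[:8]                            # tokenized feature dicts
results = cls_task.predict(data=records, label_list=dataset.get_labels())
print(results)                                                     # readable labels, e.g. ['1', '0', ...]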
...@@ -1057,20 +1168,34 @@ class BaseTask(object): ...@@ -1057,20 +1168,34 @@ class BaseTask(object):
capacity=64, capacity=64,
use_double_buffer=True, use_double_buffer=True,
iterable=True) iterable=True)
data_reader = data_loader.set_batch_generator( if self._compatible_mode:
self.reader, places=self.places) data_reader = data_loader.set_batch_generator(
self.generator, places=self.places)
else:
data_reader = data_loader.set_sample_generator(
self.generator,
places=self.places,
batch_size=self.config.batch_size,
drop_last=True)
else: else:
data_feeder = fluid.DataFeeder( data_feeder = fluid.DataFeeder(
feed_list=self.feed_list, place=self.place) feed_list=self.feed_list, place=self.place)
data_reader = data_feeder.decorate_reader( if self._compatible_mode:
self.reader, data_reader = data_feeder.decorate_reader(
multi_devices=self.config.use_data_parallel, self.generator,
drop_last=True) multi_devices=self.config.use_data_parallel,
drop_last=True)
else:
data_reader = data_feeder.decorate_reader(
paddle.batch(
self.generator, batch_size=self.config.batch_size),
multi_devices=self.config.use_data_parallel,
drop_last=True)
global_run_states = [] global_run_states = []
period_run_states = [] period_run_states = []
for run_step, batch in enumerate(data_reader(), start=1): for batch in data_reader():
step_run_state = RunState(len(self.fetch_list)) step_run_state = RunState(len(self.fetch_list))
step_run_state.run_step = 1 step_run_state.run_step = 1
num_batch_examples = len(batch) num_batch_examples = len(batch)
...@@ -1107,6 +1232,5 @@ class BaseTask(object): ...@@ -1107,6 +1232,5 @@ class BaseTask(object):
return global_run_states return global_run_states
def __repr__(self): def __repr__(self):
return "Task: %s with metrics_choices: %s, reader: %s, %s" % ( return "Task: %s with metrics_choices: %s, %s" % (
self.__class__.__name__, self.metrics_choices, self.__class__.__name__, self.metrics_choices, self.config)
self._base_data_reader.__class__.__name__, self.config)
...@@ -19,13 +19,12 @@ from __future__ import print_function ...@@ -19,13 +19,12 @@ from __future__ import print_function
from collections import OrderedDict from collections import OrderedDict
import numpy as np import numpy as np
import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import time import time
from paddlehub.common.logger import logger from paddlehub.common.logger import logger
from paddlehub.finetune.evaluate import calculate_f1_np, matthews_corrcoef from paddlehub.finetune.evaluate import calculate_f1_np, matthews_corrcoef
from paddlehub.reader.nlp_reader import ClassifyReader from paddlehub.reader.nlp_reader import ClassifyReader, LACClassifyReader
import paddlehub.network as net import paddlehub.network as net
from .base_task import BaseTask from .base_task import BaseTask
...@@ -35,8 +34,9 @@ class ClassifierTask(BaseTask): ...@@ -35,8 +34,9 @@ class ClassifierTask(BaseTask):
def __init__(self, def __init__(self,
feature, feature,
num_classes, num_classes,
feed_list, dataset=None,
data_reader, feed_list=None,
data_reader=None,
startup_program=None, startup_program=None,
config=None, config=None,
hidden_units=None, hidden_units=None,
...@@ -46,6 +46,7 @@ class ClassifierTask(BaseTask): ...@@ -46,6 +46,7 @@ class ClassifierTask(BaseTask):
main_program = feature.block.program main_program = feature.block.program
super(ClassifierTask, self).__init__( super(ClassifierTask, self).__init__(
dataset=dataset,
data_reader=data_reader, data_reader=data_reader,
main_program=main_program, main_program=main_program,
feed_list=feed_list, feed_list=feed_list,
...@@ -109,7 +110,7 @@ class ClassifierTask(BaseTask): ...@@ -109,7 +110,7 @@ class ClassifierTask(BaseTask):
run_examples += run_state.run_examples run_examples += run_state.run_examples
run_step += run_state.run_step run_step += run_state.run_step
loss_sum += np.mean( loss_sum += np.mean(
run_state.run_results[-2]) * run_state.run_examples run_state.run_results[-1]) * run_state.run_examples
acc_sum += np.mean( acc_sum += np.mean(
run_state.run_results[2]) * run_state.run_examples run_state.run_results[2]) * run_state.run_examples
np_labels = run_state.run_results[0] np_labels = run_state.run_results[0]
...@@ -140,20 +141,28 @@ class ClassifierTask(BaseTask): ...@@ -140,20 +141,28 @@ class ClassifierTask(BaseTask):
return scores, avg_loss, run_speed return scores, avg_loss, run_speed
def _postprocessing(self, run_states): def _postprocessing(self, run_states):
try: if self._compatible_mode:
id2label = { try:
val: key label_list = list(self._base_data_reader.label_map.keys())
for key, val in self._base_data_reader.label_map.items() except:
} raise Exception(
except: "ImageClassificationDataset does not support postprocessing, please use BaseCVDataset instead"
raise Exception( )
"ImageClassificationDataset does not support postprocessing, please use BaseCVDataset instead" else:
) if self._label_list:
label_list = self._label_list
else:
logger.warning(
"Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
)
return run_states
results = [] results = []
for batch_state in run_states: for batch_state in run_states:
batch_result = batch_state.run_results batch_result = batch_state.run_results
batch_infer = np.argmax(batch_result[0], axis=1) batch_infer = np.argmax(batch_result[0], axis=1)
results += [id2label[sample_infer] for sample_infer in batch_infer] results += [
label_list[sample_infer] for sample_infer in batch_infer
]
return results return results
...@@ -166,22 +175,24 @@ class TextClassifierTask(ClassifierTask): ...@@ -166,22 +175,24 @@ class TextClassifierTask(ClassifierTask):
It will use full-connect layer with softmax activation function to classify texts. It will use full-connect layer with softmax activation function to classify texts.
""" """
def __init__(self, def __init__(
num_classes, self,
feed_list, num_classes,
data_reader, dataset=None,
feature=None, feed_list=None, # Deprecated
token_feature=None, data_reader=None, # Deprecated
network=None, feature=None,
startup_program=None, token_feature=None,
config=None, network=None,
hidden_units=None, startup_program=None,
metrics_choices="default"): config=None,
hidden_units=None,
metrics_choices="default"):
""" """
Args: Args:
num_classes: total labels of the text classification task. num_classes: total labels of the text classification task.
feed_list(list): the variable name that will be feeded to the main program feed_list(list): the variable names that will be fed to the main program. Deprecated in paddlehub v1.8.
data_reader(object): data reader for the task. It must be one of ClassifyReader and LACClassifyReader. data_reader(object): data reader for the task. It must be one of ClassifyReader and LACClassifyReader. Deprecated in paddlehub v1.8.
feature(Variable): the `feature` will be used to classify texts. It must be the sentence-level feature, shape as [-1, emb_size]. `Token_feature` and `feature` couldn't be setted at the same time. One of them must be setted as not None. Default None. feature(Variable): the `feature` will be used to classify texts. It must be the sentence-level feature, shape as [-1, emb_size]. `Token_feature` and `feature` couldn't be setted at the same time. One of them must be setted as not None. Default None.
token_feature(Variable): the `feature` will be used to connect the pre-defined network. It must be the token-level feature, shape as [-1, seq_len, emb_size]. Default None. token_feature(Variable): the `feature` will be used to connect the pre-defined network. It must be the token-level feature, shape as [-1, seq_len, emb_size]. Default None.
network(str): the pre-defined network. Choices: 'bilstm', 'bow', 'cnn', 'dpcnn', 'gru' and 'lstm'. Default None. If network is setted, then `token_feature` must be setted and `feature` must be None. network(str): the pre-defined network. Choices: 'bilstm', 'bow', 'cnn', 'dpcnn', 'gru' and 'lstm'. Default None. If network is setted, then `token_feature` must be setted and `feature` must be None.
...@@ -193,12 +204,12 @@ class TextClassifierTask(ClassifierTask): ...@@ -193,12 +204,12 @@ class TextClassifierTask(ClassifierTask):
""" """
if (not feature) and (not token_feature): if (not feature) and (not token_feature):
logger.error( logger.error(
'Both token_feature and feature are None, one of them must be setted.' 'Both token_feature and feature are None, one of them must be set.'
) )
exit(1) exit(1)
elif feature and token_feature: elif feature and token_feature:
logger.error( logger.error(
'Both token_feature and feature are setted. One should be setted, the other should be None.' 'Both token_feature and feature are set. One should be set, the other should be None.'
) )
exit(1) exit(1)
...@@ -226,6 +237,7 @@ class TextClassifierTask(ClassifierTask): ...@@ -226,6 +237,7 @@ class TextClassifierTask(ClassifierTask):
metrics_choices = ["acc"] metrics_choices = ["acc"]
super(TextClassifierTask, self).__init__( super(TextClassifierTask, self).__init__(
dataset=dataset,
data_reader=data_reader, data_reader=data_reader,
feature=feature if feature else token_feature, feature=feature if feature else token_feature,
num_classes=num_classes, num_classes=num_classes,
...@@ -236,16 +248,14 @@ class TextClassifierTask(ClassifierTask): ...@@ -236,16 +248,14 @@ class TextClassifierTask(ClassifierTask):
metrics_choices=metrics_choices) metrics_choices=metrics_choices)
def _build_net(self): def _build_net(self):
if isinstance(self._base_data_reader, ClassifyReader): if not isinstance(self._base_data_reader, LACClassifyReader):
# ClassifyReader will return the seqence length of an input text # LACClassifyReader wont return the seqence length, while Dataset with tokenizer and ClassifyReader will.
self.seq_len = fluid.layers.data( self.seq_len = fluid.layers.data(
name="seq_len", shape=[1], dtype='int64', lod_level=0) name="seq_len", shape=[1], dtype='int64', lod_level=0)
self.seq_len_used = fluid.layers.squeeze(self.seq_len, axes=[1]) self.seq_len_used = fluid.layers.squeeze(self.seq_len, axes=[1])
# unpad the token_feature # unpad the token_feature
unpad_feature = fluid.layers.sequence_unpad( unpad_feature = fluid.layers.sequence_unpad(
self.feature, length=self.seq_len_used) self.feature, length=self.seq_len_used)
if self.network: if self.network:
# add pre-defined net # add pre-defined net
net_func = getattr(net.classification, self.network) net_func = getattr(net.classification, self.network)
...@@ -254,9 +264,14 @@ class TextClassifierTask(ClassifierTask): ...@@ -254,9 +264,14 @@ class TextClassifierTask(ClassifierTask):
cls_feats = net_func( cls_feats = net_func(
self.feature, emb_dim=self.feature.shape[-1]) self.feature, emb_dim=self.feature.shape[-1])
else: else:
cls_feats = net_func(unpad_feature) if self._compatible_mode and isinstance(self._base_data_reader,
logger.info( LACClassifyReader):
"%s has been added in the TextClassifierTask!" % self.network) cls_feats = net_func(self.feature)
else:
cls_feats = net_func(unpad_feature)
if self.is_train_phase:
logger.info("%s has been added in the TextClassifierTask!" %
self.network)
else: else:
# not use pre-defined net but to use fc net # not use pre-defined net but to use fc net
cls_feats = fluid.layers.dropout( cls_feats = fluid.layers.dropout(
...@@ -286,12 +301,15 @@ class TextClassifierTask(ClassifierTask): ...@@ -286,12 +301,15 @@ class TextClassifierTask(ClassifierTask):
@property @property
def feed_list(self): def feed_list(self):
feed_list = [varname for varname in self._base_feed_list] if self._compatible_mode:
if isinstance(self._base_data_reader, ClassifyReader): feed_list = [varname for varname in self._base_feed_list]
# ClassifyReader will return the seqence length of an input text if isinstance(self._base_data_reader, ClassifyReader):
feed_list += [self.seq_len.name] # ClassifyReader will return the seqence length of an input text
if self.is_train_phase or self.is_test_phase: feed_list += [self.seq_len.name]
feed_list += [self.labels[0].name] if self.is_train_phase or self.is_test_phase:
feed_list += [self.labels[0].name]
else:
feed_list = super(TextClassifierTask, self).feed_list
return feed_list return feed_list
@property @property
...@@ -303,11 +321,10 @@ class TextClassifierTask(ClassifierTask): ...@@ -303,11 +321,10 @@ class TextClassifierTask(ClassifierTask):
] ]
else: else:
# predict phase # predict phase
fetch_list = [self.outputs[0].name] if isinstance(self._base_data_reader, LACClassifyReader):
fetch_list = [self.outputs[0].name]
if isinstance(self._base_data_reader, ClassifyReader): else:
# to avoid save_inference_model to prune seq_len variable fetch_list = [self.outputs[0].name, self.seq_len.name]
fetch_list += [self.seq_len.name]
return fetch_list return fetch_list
...@@ -316,8 +333,9 @@ class MultiLabelClassifierTask(ClassifierTask): ...@@ -316,8 +333,9 @@ class MultiLabelClassifierTask(ClassifierTask):
def __init__(self, def __init__(self,
feature, feature,
num_classes, num_classes,
feed_list, dataset=None,
data_reader, feed_list=None,
data_reader=None,
startup_program=None, startup_program=None,
config=None, config=None,
hidden_units=None, hidden_units=None,
...@@ -325,8 +343,8 @@ class MultiLabelClassifierTask(ClassifierTask): ...@@ -325,8 +343,8 @@ class MultiLabelClassifierTask(ClassifierTask):
if metrics_choices == "default": if metrics_choices == "default":
metrics_choices = ["auc"] metrics_choices = ["auc"]
main_program = feature.block.program
super(MultiLabelClassifierTask, self).__init__( super(MultiLabelClassifierTask, self).__init__(
dataset=dataset,
data_reader=data_reader, data_reader=data_reader,
feature=feature, feature=feature,
num_classes=num_classes, num_classes=num_classes,
...@@ -335,7 +353,10 @@ class MultiLabelClassifierTask(ClassifierTask): ...@@ -335,7 +353,10 @@ class MultiLabelClassifierTask(ClassifierTask):
config=config, config=config,
hidden_units=hidden_units, hidden_units=hidden_units,
metrics_choices=metrics_choices) metrics_choices=metrics_choices)
self.class_name = list(data_reader.label_map.keys()) if self._compatible_mode:
self.class_name = list(data_reader.label_map.keys())
else:
self.class_name = self._label_list
def _build_net(self): def _build_net(self):
cls_feats = fluid.layers.dropout( cls_feats = fluid.layers.dropout(
...@@ -428,13 +449,22 @@ class MultiLabelClassifierTask(ClassifierTask): ...@@ -428,13 +449,22 @@ class MultiLabelClassifierTask(ClassifierTask):
def _postprocessing(self, run_states): def _postprocessing(self, run_states):
results = [] results = []
label_list = list(self._base_data_reader.label_map.keys()) if self._compatible_mode:
label_list = list(self._base_data_reader.label_map.keys())
else:
if self._label_list:
label_list = self._label_list
else:
logger.warning(
"Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
)
return run_states
for batch_state in run_states: for batch_state in run_states:
batch_result = batch_state.run_results batch_result = batch_state.run_results
for sample_id in range(len(batch_result[0])): for sample_id in range(len(batch_result[0])):
sample_result = [] sample_result = []
for category_id in range( for category_id in range(len(label_list)):
self._base_data_reader.dataset.num_labels):
sample_category_prob = batch_result[category_id][sample_id] sample_category_prob = batch_result[category_id][sample_id]
sample_category_value = np.argmax(sample_category_prob) sample_category_value = np.argmax(sample_category_prob)
sample_result.append( sample_result.append(
......
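Note on the _postprocessing change above: when the task is built without a dataset (and without the legacy data_reader), the label list must be supplied at prediction time, otherwise the method only logs the warning and returns the raw run states. A minimal sketch of that call path, assuming a predict method that accepts data and label_list arguments as the warning message suggests; the label names and input format are placeholders, not taken from this diff:

    # cls_task is a MultiLabelClassifierTask built without a dataset (hypothetical setup).
    placeholder_labels = ["toxic", "obscene", "insult"]   # hypothetical label list
    results = cls_task.predict(
        data=[["an example comment to classify"]],        # assumed input format
        label_list=placeholder_labels)                    # lets _postprocessing map outputs to labels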
...@@ -29,8 +29,9 @@ from .base_task import BaseTask ...@@ -29,8 +29,9 @@ from .base_task import BaseTask
class RegressionTask(BaseTask): class RegressionTask(BaseTask):
def __init__(self, def __init__(self,
feature, feature,
feed_list, dataset=None,
data_reader, feed_list=None,
data_reader=None,
startup_program=None, startup_program=None,
config=None, config=None,
hidden_units=None, hidden_units=None,
...@@ -40,6 +41,7 @@ class RegressionTask(BaseTask): ...@@ -40,6 +41,7 @@ class RegressionTask(BaseTask):
main_program = feature.block.program main_program = feature.block.program
super(RegressionTask, self).__init__( super(RegressionTask, self).__init__(
dataset=dataset,
data_reader=data_reader, data_reader=data_reader,
main_program=main_program, main_program=main_program,
feed_list=feed_list, feed_list=feed_list,
......
...@@ -21,10 +21,9 @@ import time ...@@ -21,10 +21,9 @@ import time
from collections import OrderedDict from collections import OrderedDict
import numpy as np import numpy as np
import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddlehub.finetune.evaluate import chunk_eval, calculate_f1 from paddlehub.finetune.evaluate import chunk_eval, calculate_f1
from paddlehub.common.utils import version_compare from paddlehub.common.logger import logger
from .base_task import BaseTask from .base_task import BaseTask
...@@ -33,8 +32,9 @@ class SequenceLabelTask(BaseTask): ...@@ -33,8 +32,9 @@ class SequenceLabelTask(BaseTask):
feature, feature,
max_seq_len, max_seq_len,
num_classes, num_classes,
feed_list, dataset=None,
data_reader, feed_list=None,
data_reader=None,
startup_program=None, startup_program=None,
config=None, config=None,
metrics_choices="default", metrics_choices="default",
...@@ -46,6 +46,7 @@ class SequenceLabelTask(BaseTask): ...@@ -46,6 +46,7 @@ class SequenceLabelTask(BaseTask):
main_program = feature.block.program main_program = feature.block.program
super(SequenceLabelTask, self).__init__( super(SequenceLabelTask, self).__init__(
dataset=dataset,
data_reader=data_reader, data_reader=data_reader,
main_program=main_program, main_program=main_program,
feed_list=feed_list, feed_list=feed_list,
...@@ -199,11 +200,14 @@ class SequenceLabelTask(BaseTask): ...@@ -199,11 +200,14 @@ class SequenceLabelTask(BaseTask):
@property @property
def feed_list(self): def feed_list(self):
feed_list = [varname for varname in self._base_feed_list] if self._compatible_mode:
if self.is_train_phase or self.is_test_phase: feed_list = [varname for varname in self._base_feed_list]
feed_list += [self.labels[0].name, self.seq_len.name] if self.is_train_phase or self.is_test_phase:
feed_list += [self.labels[0].name, self.seq_len.name]
else:
feed_list += [self.seq_len.name]
else: else:
feed_list += [self.seq_len.name] feed_list = super(SequenceLabelTask, self).feed_list
return feed_list return feed_list
@property @property
...@@ -215,10 +219,22 @@ class SequenceLabelTask(BaseTask): ...@@ -215,10 +219,22 @@ class SequenceLabelTask(BaseTask):
return [output.name for output in self.outputs] return [output.name for output in self.outputs]
def _postprocessing(self, run_states): def _postprocessing(self, run_states):
id2label = { if self._compatible_mode:
val: key id2label = {
for key, val in self._base_data_reader.label_map.items() val: key
} for key, val in self._base_data_reader.label_map.items()
}
else:
if self._label_list:
id2label = {}
for index, label in enumerate(self._label_list):
id2label[index] = label
else:
logger.warning(
"Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
)
return run_states
results = [] results = []
for batch_states in run_states: for batch_states in run_states:
batch_results = batch_states.run_results batch_results = batch_states.run_results
......
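The id2label mapping introduced above (an enumerate loop over self._label_list) is plain Python and can be reproduced outside the task for a quick sanity check; the labels below are hypothetical:

    # Equivalent construction of the index -> label mapping, e.g. for a BIO tag list.
    label_list = ["B-PER", "I-PER", "O"]
    id2label = dict(enumerate(label_list))  # {0: 'B-PER', 1: 'I-PER', 2: 'O'}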
...@@ -688,11 +688,13 @@ class Features(object): ...@@ -688,11 +688,13 @@ class Features(object):
s = "" s = ""
s += "unique_id: %s " % self.unique_id s += "unique_id: %s " % self.unique_id
s += "example_index: %s " % self.example_index s += "example_index: %s " % self.example_index
s += "doc_span_index: %s" % self.doc_span_index
s += "tokens: %s" % self.tokens
s += "token_to_orig_map %s" % self.token_to_orig_map
s += "token_is_max_context %s" % self.token_is_max_context
s += "start_position: %s " % self.start_position s += "start_position: %s " % self.start_position
s += "end_position: %s " % self.end_position s += "end_position: %s " % self.end_position
s += "is_impossible: %s " % self.is_impossible s += "is_impossible: %s " % self.is_impossible
# s += "tokens: %s" % self.tokens
# s += "token_to_orig_map %s" % self.token_to_orig_map
return s return s
......
...@@ -140,29 +140,6 @@ class FullTokenizer(object): ...@@ -140,29 +140,6 @@ class FullTokenizer(object):
return convert_by_vocab(self.inv_vocab, ids) return convert_by_vocab(self.inv_vocab, ids)
class CharTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in text.lower().split(" "):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class WSSPTokenizer(object): class WSSPTokenizer(object):
def __init__(self, vocab_file, sp_model_dir, word_dict, ws=True, def __init__(self, vocab_file, sp_model_dir, word_dict, ws=True,
lower=True): lower=True):
......
from .bert_tokenizer import BertTokenizer
from .bert_tokenizer import ErnieTinyTokenizer
This diff has been collapsed.
from collections import OrderedDict
import unicodedata
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = {}
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n").split("\t")[0]
vocab[token] = index
return vocab
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
def is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (
cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
def is_chinese_char(char):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
cp = ord(char)
if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF) #
or (cp >= 0x20000 and cp <= 0x2A6DF) #
or (cp >= 0x2A700 and cp <= 0x2B73F) #
or (cp >= 0x2B740 and cp <= 0x2B81F) #
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
): #
return True
return False
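The helpers in this new utility module depend only on the standard library (unicodedata), so they can be exercised directly. A quick sketch of the expected behaviour; "vocab.txt" is a hypothetical file with one token per line (anything after a tab is ignored by load_vocab):

    print(whitespace_tokenize("  hello   world "))      # ['hello', 'world']
    print(is_whitespace(" "), is_control("\x00"))        # True True
    print(is_punctuation("$"), is_punctuation("a"))      # True False
    print(is_chinese_char("中"), is_chinese_char("A"))   # True False
    # vocab = load_vocab("vocab.txt")  # {token: index} dict built line by line from the file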
...@@ -8,6 +8,7 @@ visualdl >= 2.0.0b ...@@ -8,6 +8,7 @@ visualdl >= 2.0.0b
cma >= 2.7.0 cma >= 2.7.0
sentencepiece sentencepiece
colorlog colorlog
tqdm
# pandas no longer support python2 in version 0.25 and above # pandas no longer support python2 in version 0.25 and above
pandas ; python_version >= "3" pandas ; python_version >= "3"
......