Unverified commit a1cf32cd authored by kinghuin, committed by GitHub

Tokenizer refactor (#677)

Parent a253ecaa
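This commit moves the NLP demos and datasets from the reader/feed_list API to a tokenizer-based API: a tokenizer is built from the module's vocabulary files, the dataset encodes its examples with that tokenizer, and the task consumes the dataset directly. A minimal sketch of the new workflow, using the ernie_tiny module and ChnSentiCorp dataset from the demos below (the config values are illustrative):

import paddlehub as hub

module = hub.Module(name="ernie_tiny")
inputs, outputs, program = module.context(trainable=True, max_seq_len=128)

# ernie_tiny segments words into subwords, so it needs its own tokenizer;
# other transformer modules fall back to BertTokenizer.
if module.name == "ernie_tiny":
    tokenizer = hub.ErnieTinyTokenizer(
        vocab_file=module.get_vocab_path(),
        spm_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())
else:
    tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())

# The dataset now owns preprocessing: it encodes its examples into records with the tokenizer.
dataset = hub.dataset.ChnSentiCorp(tokenizer=tokenizer, max_seq_len=128)

config = hub.RunConfig(num_epoch=1, batch_size=32, checkpoint_dir="ckpt_chnsenticorp")
cls_task = hub.TextClassifierTask(
    dataset=dataset,
    feature=outputs["pooled_output"],
    num_classes=dataset.num_labels,
    config=config)
cls_task.finetune_and_eval()

# Prediction takes records produced by tokenizer.encode plus the dataset's label list.
encoded_data = [
    tokenizer.encode(text=text, max_seq_len=128)
    for text in ["交通方便;环境很好;服务态度很好 房间较小"]
]
print(cls_task.predict(data=encoded_data, label_list=dataset.get_labels()))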
......@@ -39,18 +39,17 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use MultiLabelReader to read dataset
dataset = hub.dataset.Toxic()
reader = hub.reader.MultiLabelClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
# Setup feed list for data feeder
feed_list = [
inputs["input_ids"].name, inputs["position_ids"].name,
inputs["segment_ids"].name, inputs["input_mask"].name
]
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it performs word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.Toxic(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
......@@ -72,9 +71,8 @@ if __name__ == '__main__':
# Define a classification fine-tune task by PaddleHub's API
multi_label_cls_task = hub.MultiLabelClassifierTask(
data_reader=reader,
dataset=dataset,
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
config=config)
......
......@@ -45,20 +45,11 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use MultiLabelReader to read dataset
# Download the dataset and get its label list and number of labels
# If you only need the label information, you can omit the tokenizer parameter to avoid preprocessing the training set.
dataset = hub.dataset.Toxic()
reader = hub.reader.MultiLabelClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
# Setup feed list for data feeder
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
num_classes = dataset.num_labels
label_list = dataset.get_labels()
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
......@@ -75,20 +66,29 @@ if __name__ == '__main__':
# Define a classification fine-tune task by PaddleHub's API
multi_label_cls_task = hub.MultiLabelClassifierTask(
data_reader=reader,
dataset=dataset,
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
config=config)
# Data to be predicted
data = [
[
"Yes you did. And you admitted to doing it. See the Warren Kinsella talk page."
],
[
"I asked you a question. We both know you have my page on your watch list, so are why are you playing games and making me formally ping you? Makin'Bacon"
],
"Yes you did. And you admitted to doing it. See the Warren Kinsella talk page.",
"I asked you a question. We both know you have my page on your watch list, so are why are you playing games and making me formally ping you? Makin'Bacon",
]
# Use the appropriate tokenizer to preprocess the data
# For ernie_tiny, it performs word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
print(multi_label_cls_task.predict(data=data, return_result=True))
encoded_data = [
tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
for text in data
]
print(
multi_label_cls_task.predict(data=encoded_data, label_list=label_list))
......@@ -36,31 +36,28 @@ args = parser.parse_args()
if __name__ == '__main__':
# Load Paddlehub ERNIE pretrained model
module = hub.Module(name="ernie")
module = hub.Module(name="ernie_tiny")
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use ClassifyReader to read dataset
dataset = hub.dataset.NLPCC_DBQA()
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it performs word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.NLPCC_DBQA(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Select fine-tune strategy, setup config and fine-tune
strategy = hub.AdamWeightDecayStrategy(
warmup_proportion=args.warmup_proportion,
......@@ -78,9 +75,8 @@ if __name__ == '__main__':
# Define a classification fine-tune task by PaddleHub's API
cls_task = hub.TextClassifierTask(
data_reader=reader,
dataset=dataset,
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
config=config)
......
......@@ -39,30 +39,20 @@ args = parser.parse_args()
if __name__ == '__main__':
# loading Paddlehub ERNIE pretrained model
module = hub.Module(name="ernie")
module = hub.Module(name="ernie_tiny")
inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
# Sentence classification dataset reader
# Download the dataset and get its label list and number of labels
# If you only need the label information, you can omit the tokenizer parameter to avoid preprocessing the training set.
dataset = hub.dataset.NLPCC_DBQA()
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
num_classes = dataset.num_labels
label_list = dataset.get_labels()
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
use_data_parallel=False,
......@@ -73,9 +63,8 @@ if __name__ == '__main__':
# Define a classification fine-tune task by PaddleHub's API
cls_task = hub.TextClassifierTask(
data_reader=reader,
dataset=dataset,
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
config=config)
......@@ -83,5 +72,18 @@ if __name__ == '__main__':
data = [["北京奥运博物馆的场景效果负责人是谁?", "主要承担奥运文物征集、保管、研究和爱国主义教育基地建设相关工作。"],
["北京奥运博物馆的场景效果负责人是谁", "于海勃,美国加利福尼亚大学教授 场景效果负责人 总设计师"],
["北京奥运博物馆的场景效果负责人是谁?", "洪麦恩,清华大学美术学院教授 内容及主展线负责人 总设计师"]]
print(cls_task.predict(data=data, return_result=True))
# Use the appropriate tokenizer to preprocess the data
# For ernie_tiny, it performs word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
encoded_data = [
tokenizer.encode(
text=text, text_pair=text_pair, max_seq_len=args.max_seq_len)
for text, text_pair in data
]
print(cls_task.predict(data=encoded_data, label_list=label_list))
......@@ -17,7 +17,6 @@
import argparse
import ast
import paddle.fluid as fluid
import paddlehub as hub
hub.common.logger.logger.setLevel("INFO")
......@@ -42,28 +41,23 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use ReadingComprehensionReader to read dataset
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it performs word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
# If you want to load the SQuAD 2.0 dataset, just set version_2_with_negative to True
dataset = hub.dataset.SQUAD(version_2_with_negative=False)
dataset = hub.dataset.SQUAD(
version_2_with_negative=False,
tokenizer=tokenizer,
max_seq_len=args.max_seq_len)
# dataset = hub.dataset.SQUAD(version_2_with_negative=True)
reader = hub.reader.ReadingComprehensionReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
doc_stride=128,
max_query_length=64)
seq_output = outputs["sequence_output"]
# Setup feed list for data feeder
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Select fine-tune strategy, setup config and fine-tune
strategy = hub.AdamWeightDecayStrategy(
weight_decay=args.weight_decay,
......@@ -72,7 +66,7 @@ if __name__ == '__main__':
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
eval_interval=300,
eval_interval=100,
use_data_parallel=args.use_data_parallel,
use_cuda=args.use_gpu,
num_epoch=args.num_epoch,
......@@ -82,9 +76,8 @@ if __name__ == '__main__':
# Define a reading comprehension fine-tune task by PaddleHub's API
reading_comprehension_task = hub.ReadingComprehensionTask(
data_reader=reader,
feature=seq_output,
feed_list=feed_list,
dataset=dataset,
feature=outputs["sequence_output"],
config=config,
sub_task="squad",
)
......
......@@ -20,12 +20,6 @@ from __future__ import print_function
import argparse
import ast
import numpy as np
import os
import time
import paddle
import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
......@@ -43,27 +37,11 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use RegressionReader to read dataset
dataset = hub.dataset.GLUE("STS-B")
reader = hub.reader.RegressionReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
use_data_parallel=False,
......@@ -74,13 +52,22 @@ if __name__ == '__main__':
# Define a regression fine-tune task by PaddleHub's API
reg_task = hub.RegressionTask(
data_reader=reader,
feature=pooled_output,
feed_list=feed_list,
config=config,
)
# Data to be predicted
data = [[d.text_a, d.text_b] for d in dataset.get_predict_examples()[:10]]
print(reg_task.predict(data=data, return_result=True))
# STS-B provides the prediction data and the dataset has already processed it. To process customized data,
# see predict.py in the text_classification demo
# Use the appropriate tokenizer to preprocess the data
# For ernie_tiny, it performs word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.GLUE(
"STS-B", tokenizer=tokenizer, max_seq_len=args.max_seq_len)
encoded_data = dataset.get_predict_records()[:10]
print(reg_task.predict(data=encoded_data))
......@@ -17,7 +17,6 @@
import argparse
import ast
import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
......@@ -41,27 +40,24 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use RegressionReader to read dataset
dataset = hub.dataset.GLUE("STS-B")
reader = hub.reader.RegressionReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it performs word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.GLUE(
"STS-B", tokenizer=tokenizer, max_seq_len=args.max_seq_len)
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Select fine-tune strategy, setup config and fine-tune
strategy = hub.AdamWeightDecayStrategy(
warmup_proportion=args.warmup_proportion,
......@@ -70,7 +66,6 @@ if __name__ == '__main__':
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
eval_interval=300,
use_data_parallel=args.use_data_parallel,
use_cuda=args.use_gpu,
num_epoch=args.num_epoch,
......@@ -80,10 +75,7 @@ if __name__ == '__main__':
# Define a regression fine-tune task by PaddleHub's API
reg_task = hub.RegressionTask(
data_reader=reader,
feature=pooled_output,
feed_list=feed_list,
config=config)
dataset=dataset, feature=pooled_output, config=config)
# Fine-tune and evaluate by PaddleHub's API
# It will finish training, evaluation and testing, and save the model automatically
......
......@@ -42,30 +42,16 @@ if __name__ == '__main__':
module = hub.Module(name="ernie_tiny")
inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
# Sentence labeling dataset reader
# Download the dataset and get its label list and number of labels
# If you only need the label information, you can omit the tokenizer parameter to avoid preprocessing the training set.
dataset = hub.dataset.MSRA_NER()
reader = hub.reader.SequenceLabelReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
inv_label_map = {val: key for key, val in reader.label_map.items()}
num_classes = dataset.num_labels
label_list = dataset.get_labels()
# Construct transfer learning network
# Use "sequence_output" for token-level output.
sequence_output = outputs["sequence_output"]
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
use_data_parallel=False,
......@@ -77,33 +63,31 @@ if __name__ == '__main__':
# Define a sequence labeling fine-tune task by PaddleHub's API
# If add_crf is True, the network uses CRF as the decoder
seq_label_task = hub.SequenceLabelTask(
data_reader=reader,
feature=sequence_output,
feed_list=feed_list,
max_seq_len=args.max_seq_len,
num_classes=dataset.num_labels,
num_classes=num_classes,
config=config,
add_crf=False)
# Data to be predicted
# If using python 2, prefix "u" is necessary
data = [
[u"我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。"],
[u"为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。"],
[u"其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。"],
[u"有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。"],
[u"不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。"],
text_a = [
"我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。",
"为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。",
"其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。",
"有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。",
"不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。",
]
# Add 0x02 between characters to match the format of the training data;
# otherwise, the length of the prediction results will not match the input string
# when the input string contains non-Chinese characters.
tmp_data = []
for example in data:
formatted = []
for sentence in example:
formatted.append('\x02'.join(list(sentence)))
tmp_data.append(formatted)
data = tmp_data
formatted_text_a = list(map("\002".join, text_a))
print(seq_label_task.predict(data=data, return_result=True))
# Use the appropriate tokenizer to preprocess the data
# For ernie_tiny, it uses BertTokenizer as well.
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
encoded_data = [
tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
for text in formatted_text_a
]
print(seq_label_task.predict(data=encoded_data, label_list=label_list))
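For sequence labeling prediction, each input string is joined with "\002" so that the tokenizer keeps one token per character and the predicted label sequence lines up with the input characters. A short illustration of that formatting step, reusing the tokenizer, label_list and seq_label_task defined above (the sample sentence is illustrative):

# Joining a string with "\002" interleaves the delimiter between characters,
# matching the character-level format of the training data.
text = "我们变而以书会友"
formatted = "\002".join(text)  # equivalent to '\x02'.join(list(text))
record = tokenizer.encode(text=formatted, max_seq_len=args.max_seq_len)
print(seq_label_task.predict(data=[record], label_list=label_list))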
......@@ -40,26 +40,16 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use SequenceLabelReader to read dataset
dataset = hub.dataset.MSRA_NER()
reader = hub.reader.SequenceLabelReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it uses BertTokenizer as well.
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.MSRA_NER(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
# Construct transfer learning network
# Use "sequence_output" for token-level output.
sequence_output = outputs["sequence_output"]
# Setup feed list for data feeder
# Must feed all the tensor of module need
feed_list = [
inputs["input_ids"].name, inputs["position_ids"].name,
inputs["segment_ids"].name, inputs["input_mask"].name
]
# Select a fine-tune strategy
strategy = hub.AdamWeightDecayStrategy(
warmup_proportion=args.warmup_proportion,
......@@ -78,9 +68,8 @@ if __name__ == '__main__':
# Define a sequence labeling fine-tune task by PaddleHub's API
# If add_crf is True, the network uses CRF as the decoder
seq_label_task = hub.SequenceLabelTask(
data_reader=reader,
dataset=dataset,
feature=sequence_output,
feed_list=feed_list,
max_seq_len=args.max_seq_len,
num_classes=dataset.num_labels,
config=config,
......
......@@ -21,9 +21,9 @@ parser.add_argument("--max_seq_len", type=int, default=512,
# yapf: enable.
class TransformerSequenceLabelLayer(fluid.dygraph.Layer):
class TransformerSeqLabeling(fluid.dygraph.Layer):
def __init__(self, num_classes, transformer):
super(TransformerSequenceLabelLayer, self).__init__()
super(TransformerSeqLabeling, self).__init__()
self.num_classes = num_classes
self.transformer = transformer
self.fc = Linear(input_dim=768, output_dim=num_classes)
......@@ -39,11 +39,15 @@ class TransformerSequenceLabelLayer(fluid.dygraph.Layer):
def finetune(args):
ernie = hub.Module(name="ernie", max_seq_len=args.max_seq_len)
module = hub.Module(name="ernie", max_seq_len=args.max_seq_len)
# Use the appropriate tokenizer to preprocess the data set
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.MSRA_NER(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
with fluid.dygraph.guard():
dataset = hub.dataset.MSRA_NER()
ts = TransformerSequenceLabelLayer(
num_classes=dataset.num_labels, transformer=ernie)
ts = TransformerSeqLabeling(
num_classes=dataset.num_labels, transformer=module)
adam = AdamOptimizer(learning_rate=1e-5, parameter_list=ts.parameters())
state_dict_path = os.path.join(args.checkpoint_dir,
'dygraph_state_dict')
......@@ -51,34 +55,32 @@ def finetune(args):
state_dict, _ = fluid.load_dygraph(state_dict_path)
ts.load_dict(state_dict)
reader = hub.reader.SequenceLabelReader(
dataset=dataset,
vocab_path=ernie.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=ernie.get_spm_path(),
word_dict_path=ernie.get_word_dict_path())
train_reader = reader.data_generator(
batch_size=args.batch_size, phase='train')
loss_sum = total_infer = total_label = total_correct = cnt = 0
# Run num_epoch training epochs
for epoch in range(args.num_epoch):
# Read the training data and train on it
for batch_id, data in enumerate(train_reader()):
input_ids = np.array(data[0][0]).astype(np.int64)
position_ids = np.array(data[0][1]).astype(np.int64)
segment_ids = np.array(data[0][2]).astype(np.int64)
input_mask = np.array(data[0][3]).astype(np.float32)
labels = np.array(data[0][4]).astype(np.int64).reshape(-1, 1)
seq_len = np.squeeze(
np.array(data[0][5]).astype(np.int64), axis=1)
for batch_id, data in enumerate(
dataset.batch_records_generator(
phase="train",
batch_size=args.batch_size,
shuffle=True,
pad_to_batch_max_seq_len=False)):
batch_size = len(data["input_ids"])
input_ids = np.array(data["input_ids"]).astype(
np.int64).reshape([batch_size, -1, 1])
position_ids = np.array(data["position_ids"]).astype(
np.int64).reshape([batch_size, -1, 1])
segment_ids = np.array(data["segment_ids"]).astype(
np.int64).reshape([batch_size, -1, 1])
input_mask = np.array(data["input_mask"]).astype(
np.float32).reshape([batch_size, -1, 1])
labels = np.array(data["label"]).astype(np.int64).reshape(-1, 1)
seq_len = np.array(data["seq_len"]).astype(np.int64).reshape(
-1, 1)
pred, ret_infers = ts(input_ids, position_ids, segment_ids,
input_mask)
loss = fluid.layers.cross_entropy(pred, to_variable(labels))
avg_loss = fluid.layers.mean(loss)
avg_loss.backward()
# Update the parameters
adam.minimize(avg_loss)
loss_sum += avg_loss.numpy() * labels.shape[0]
......
......@@ -20,11 +20,7 @@ from __future__ import print_function
import argparse
import ast
import numpy as np
import os
import time
import paddle
import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
......@@ -43,32 +39,11 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use accuracy as metrics
# Choose dataset: GLUE/XNLI/ChinesesGLUE/NLPCC-DBQA/LCQMC
# Download the dataset and get its label list and number of labels
# If you only need the label information, you can omit the tokenizer parameter to avoid preprocessing the training set.
dataset = hub.dataset.ChnSentiCorp()
# For ernie_tiny, it use sub-word to tokenize chinese sentence
# If not ernie tiny, sp_model_path and word_dict_path should be set None
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensor of module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
num_classes = dataset.num_labels
label_list = dataset.get_labels()
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
......@@ -80,14 +55,26 @@ if __name__ == '__main__':
# Define a classification fine-tune task by PaddleHub's API
cls_task = hub.TextClassifierTask(
data_reader=reader,
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
feature=outputs["pooled_output"],
num_classes=num_classes,
config=config)
# Data to be predicted
data = [["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"], ["交通方便;环境很好;服务态度很好 房间较小"],
["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"]]
print(cls_task.predict(data=data, return_result=True))
text_a = [
"这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般", "交通方便;环境很好;服务态度很好 房间较小",
"19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"
]
# Use the appropriate tokenizer to preprocess the data
# For ernie_tiny, it performs word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
encoded_data = [
tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
for text in text_a
]
print(cls_task.predict(data=encoded_data, label_list=label_list))
......@@ -20,11 +20,7 @@ from __future__ import print_function
import argparse
import ast
import numpy as np
import os
import time
import paddle
import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
......@@ -44,33 +40,17 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use accuracy as metrics
# Choose dataset: GLUE/XNLI/ChinesesGLUE/NLPCC-DBQA/LCQMC
# Download the dataset and get its label list and number of labels
# If you only need the label information, you can omit the tokenizer parameter to avoid preprocessing the training set.
dataset = hub.dataset.ChnSentiCorp()
# For ernie_tiny, it use sub-word to tokenize chinese sentence
# If not ernie tiny, sp_model_path and word_dict_path should be set None
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
num_classes = dataset.num_labels
label_list = dataset.get_labels()
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
token_feature = outputs["sequence_output"]
# Setup feed list for data feeder
# Must feed all the tensor of module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
use_data_parallel=args.use_data_parallel,
......@@ -85,15 +65,27 @@ if __name__ == '__main__':
# you must use outputs["sequence_output"] as the token_feature of TextClassifierTask,
# rather than outputs["pooled_output"], and leave feature as None
cls_task = hub.TextClassifierTask(
data_reader=reader,
token_feature=token_feature,
feed_list=feed_list,
network=args.network,
num_classes=dataset.num_labels,
config=config)
# Data to be predicted
data = [["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"], ["交通方便;环境很好;服务态度很好 房间较小"],
["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"]]
print(cls_task.predict(data=data, return_result=True))
text_a = [
"这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般", "交通方便;环境很好;服务态度很好 房间较小",
"19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"
]
# Use the appropriate tokenizer to preprocess the data
# For ernie_tiny, it performs word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
encoded_data = [
tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
for text in text_a
]
print(cls_task.predict(data=encoded_data, label_list=label_list))
......@@ -7,5 +7,5 @@ python -u predict_predefine_net.py \
--checkpoint_dir=$CKPT_DIR \
--max_seq_len=128 \
--use_gpu=True \
--batch_size=24 \
--batch_size=1 \
--network=bilstm
......@@ -40,11 +40,23 @@ class TransformerClassifier(fluid.dygraph.Layer):
def finetune(args):
ernie = hub.Module(name="ernie", max_seq_len=args.max_seq_len)
module = hub.Module(name="ernie", max_seq_len=args.max_seq_len)
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it performs word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path(),
)
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.ChnSentiCorp(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
with fluid.dygraph.guard():
dataset = hub.dataset.ChnSentiCorp()
tc = TransformerClassifier(
num_classes=dataset.num_labels, transformer=ernie)
num_classes=dataset.num_labels, transformer=module)
adam = AdamOptimizer(learning_rate=1e-5, parameter_list=tc.parameters())
state_dict_path = os.path.join(args.checkpoint_dir,
'dygraph_state_dict')
......@@ -52,32 +64,31 @@ def finetune(args):
state_dict, _ = fluid.load_dygraph(state_dict_path)
tc.load_dict(state_dict)
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=ernie.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=ernie.get_spm_path(),
word_dict_path=ernie.get_word_dict_path())
train_reader = reader.data_generator(
batch_size=args.batch_size, phase='train')
loss_sum = acc_sum = cnt = 0
# Run num_epoch training epochs
for epoch in range(args.num_epoch):
# Read the training data and train on it
for batch_id, data in enumerate(train_reader()):
input_ids = np.array(data[0][0]).astype(np.int64)
position_ids = np.array(data[0][1]).astype(np.int64)
segment_ids = np.array(data[0][2]).astype(np.int64)
input_mask = np.array(data[0][3]).astype(np.float32)
labels = np.array(data[0][4]).astype(np.int64)
for batch_id, data in enumerate(
dataset.batch_records_generator(
phase="train",
batch_size=args.batch_size,
shuffle=True,
pad_to_batch_max_seq_len=False)):
batch_size = len(data["input_ids"])
input_ids = np.array(data["input_ids"]).astype(
np.int64).reshape([batch_size, -1, 1])
position_ids = np.array(data["position_ids"]).astype(
np.int64).reshape([batch_size, -1, 1])
segment_ids = np.array(data["segment_ids"]).astype(
np.int64).reshape([batch_size, -1, 1])
input_mask = np.array(data["input_mask"]).astype(
np.float32).reshape([batch_size, -1, 1])
labels = np.array(data["label"]).astype(np.int64).reshape(
[batch_size, 1])
pred = tc(input_ids, position_ids, segment_ids, input_mask)
acc = fluid.layers.accuracy(pred, to_variable(labels))
loss = fluid.layers.cross_entropy(pred, to_variable(labels))
avg_loss = fluid.layers.mean(loss)
avg_loss.backward()
# Update the parameters
adam.minimize(avg_loss)
loss_sum += avg_loss.numpy() * labels.shape[0]
......
......@@ -16,6 +16,7 @@
import argparse
import ast
import paddlehub as hub
# yapf: disable
......@@ -39,35 +40,24 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use accuracy as metrics
# Choose dataset: GLUE/XNLI/ChinesesGLUE/NLPCC-DBQA/LCQMC
# metric should be acc, f1 or matthews
dataset = hub.dataset.ChnSentiCorp()
metrics_choices = ["acc"]
# For ernie_tiny, it use sub-word to tokenize chinese sentence
# If not ernie tiny, sp_model_path and word_dict_path should be set None
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it performs word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.ChnSentiCorp(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensor of module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Select fine-tune strategy, setup config and fine-tune
strategy = hub.AdamWeightDecayStrategy(
warmup_proportion=args.warmup_proportion,
......@@ -85,12 +75,11 @@ if __name__ == '__main__':
# Define a classification fine-tune task by PaddleHub's API
cls_task = hub.TextClassifierTask(
data_reader=reader,
dataset=dataset,
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
config=config,
metrics_choices=metrics_choices)
metrics_choices=["acc"])
# Fine-tune and evaluate by PaddleHub's API
# It will finish training, evaluation and testing, and save the model automatically
......
......@@ -40,35 +40,24 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use accuracy as metrics
# Choose dataset: GLUE/XNLI/ChinesesGLUE/NLPCC-DBQA/LCQMC
# metric should be acc, f1 or matthews
dataset = hub.dataset.ChnSentiCorp()
metrics_choices = ["acc"]
# For ernie_tiny, it use sub-word to tokenize chinese sentence
# If not ernie tiny, sp_model_path and word_dict_path should be set None
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it performs word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.ChnSentiCorp(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
token_feature = outputs["sequence_output"]
# Setup feed list for data feeder
# Must feed all the tensor of module need
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Select fine-tune strategy, setup config and fine-tune
strategy = hub.AdamWeightDecayStrategy(
warmup_proportion=args.warmup_proportion,
......@@ -90,13 +79,12 @@ if __name__ == '__main__':
# you must use outputs["sequence_output"] as the token_feature of TextClassifierTask,
# rather than outputs["pooled_output"], and leave feature as None
cls_task = hub.TextClassifierTask(
data_reader=reader,
dataset=dataset,
token_feature=token_feature,
feed_list=feed_list,
network=args.network,
num_classes=dataset.num_labels,
config=config,
metrics_choices=metrics_choices)
metrics_choices=["acc"])
# Fine-tune and evaluate by PaddleHub's API
# It will finish training, evaluation and testing, and save the model automatically
......
......@@ -31,6 +31,7 @@ from . import dataset
from . import finetune
from . import reader
from . import network
from . import tokenizer
from .common.dir import USER_HOME
from .common.dir import HUB_HOME
......@@ -70,3 +71,6 @@ from .finetune.strategy import CombinedStrategy
from .autofinetune.evaluator import report_final_result
from .module.nlp_module import NLPPredictionModule, TransformerModule
from .tokenizer.bert_tokenizer import BertTokenizer
from .tokenizer.bert_tokenizer import ErnieTinyTokenizer
......@@ -20,11 +20,16 @@ from __future__ import print_function
import os
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
class BQ(BaseNLPDataset):
def __init__(self):
class BQ(TextClassificationDataset):
"""
The Bank Question (BQ) corpus, a Chinese corpus for sentence semantic equivalence identification (SSEI),
contains 120,000 question pairs from one year of online bank customer service logs.
"""
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "bq")
base_path = self._download_dataset(
dataset_dir,
......@@ -36,18 +41,16 @@ class BQ(BaseNLPDataset):
test_file="test.txt",
label_file=None,
label_list=["0", "1"],
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
if __name__ == "__main__":
ds = BQ()
print("first 10 dev")
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
ds = BQ(tokenizer=BertTokenizer(vocab_file='vocab.txt'), max_seq_len=10)
print("first 10 dev examples")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -23,16 +23,16 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
class ChnSentiCorp(BaseNLPDataset):
class ChnSentiCorp(TextClassificationDataset):
"""
ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for
opinion mining)
"""
def __init__(self):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "chnsenticorp")
base_path = self._download_dataset(
dataset_dir,
......@@ -44,7 +44,8 @@ class ChnSentiCorp(BaseNLPDataset):
test_file="test.tsv",
label_file=None,
label_list=["0", "1"],
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -63,6 +64,13 @@ class ChnSentiCorp(BaseNLPDataset):
if __name__ == "__main__":
ds = ChnSentiCorp()
for e in ds.get_train_examples()[:10]:
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = ChnSentiCorp(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev examples")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -20,7 +20,7 @@ import os
from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import MRCDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/cmrc2018.tar.gz"
SPIECE_UNDERLINE = '▁'
......@@ -62,10 +62,14 @@ class CMRC2018Example(object):
return s
class CMRC2018(BaseNLPDataset):
class CMRC2018(MRCDataset):
"""A single set of features of data."""
def __init__(self):
def __init__(self,
tokenizer=None,
max_seq_len=None,
max_query_len=64,
doc_stride=128):
dataset_dir = os.path.join(DATA_HOME, "cmrc2018")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(CMRC2018, self).__init__(
......@@ -75,6 +79,10 @@ class CMRC2018(BaseNLPDataset):
test_file=None,
label_file=None,
label_list=None,
tokenizer=tokenizer,
max_seq_len=max_seq_len,
max_query_len=max_query_len,
doc_stride=doc_stride,
)
def _read_file(self, input_file, phase=False):
......@@ -201,7 +209,9 @@ class CMRC2018(BaseNLPDataset):
if __name__ == "__main__":
print("begin")
ds = CMRC2018()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = CMRC2018(tokenizer=tokenizer, max_seq_len=50)
print("train")
examples = ds.get_train_examples()
for index, e in enumerate(examples):
......
......@@ -121,6 +121,20 @@ class BaseDataset(object):
def get_predict_examples(self):
return self.predict_examples
def get_examples(self, phase):
if phase == "train":
return self.get_train_examples()
elif phase == "dev":
return self.get_dev_examples()
elif phase == "test":
return self.get_test_examples()
elif phase == "val":
return self.get_val_examples()
elif phase == "predict":
return self.get_predict_examples()
else:
raise ValueError("Invalid phase: %s" % phase)
def get_labels(self):
return self.label_list
......
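The new BaseDataset.get_examples dispatcher selects a split by name instead of requiring the phase-specific getters. A small illustrative use, assuming the same vocab.txt path and ChnSentiCorp dataset exercised in the __main__ blocks of this commit:

import paddlehub as hub
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer

tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = hub.dataset.ChnSentiCorp(tokenizer=tokenizer, max_seq_len=128)
for phase in ("train", "dev", "test"):
    # Each phase name maps to the corresponding get_*_examples() call.
    print(phase, len(ds.get_examples(phase)))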
......@@ -20,7 +20,7 @@ import os
from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import MRCDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/drcd.tar.gz"
SPIECE_UNDERLINE = '▁'
......@@ -62,10 +62,16 @@ class DRCDExample(object):
return s
class DRCD(BaseNLPDataset):
class DRCD(MRCDataset):
"""A single set of features of data."""
def __init__(self):
def __init__(
self,
tokenizer=None,
max_seq_len=None,
max_query_len=64,
doc_stride=128,
):
dataset_dir = os.path.join(DATA_HOME, "drcd")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(DRCD, self).__init__(
......@@ -75,6 +81,10 @@ class DRCD(BaseNLPDataset):
test_file="DRCD_test.json",
label_file=None,
label_list=None,
tokenizer=tokenizer,
max_seq_len=max_seq_len,
max_query_len=max_query_len,
doc_stride=doc_stride,
)
def _read_file(self, input_file, phase=None):
......@@ -176,8 +186,8 @@ class DRCD(BaseNLPDataset):
cleaned_answer_text = "".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning((actual_text, " vs ",
cleaned_answer_text, " in ", qa))
logger.warning("Could not find answer: '%s' vs. '%s'" %
(actual_text, cleaned_answer_text))
continue
example = DRCDExample(
qas_id=qas_id,
......@@ -191,7 +201,9 @@ class DRCD(BaseNLPDataset):
if __name__ == "__main__":
ds = DRCD()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = DRCD(tokenizer=tokenizer, max_seq_len=50)
print("train")
examples = ds.get_train_examples()
for index, e in enumerate(examples):
......
......@@ -36,7 +36,7 @@ class GLUE(BaseNLPDataset):
for more information
"""
def __init__(self, sub_dataset='SST-2'):
def __init__(self, sub_dataset='SST-2', tokenizer=None, max_seq_len=None):
# sub_dataset : CoLA, MNLI, MRPC, QNLI, QQP, RTE, SST-2, STS-B
if sub_dataset not in [
'CoLA', 'MNLI', 'MNLI_m', 'MNLI_mm', 'MRPC', 'QNLI', 'QQP',
......@@ -85,7 +85,8 @@ class GLUE(BaseNLPDataset):
predict_file=predict_file,
label_file=None,
label_list=label_list,
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -165,11 +166,13 @@ class GLUE(BaseNLPDataset):
if __name__ == "__main__":
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
for sub_dataset in [
'CoLA', 'MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'SST-2', 'STS-B'
]:
print(sub_dataset)
ds = GLUE(sub_dataset=sub_dataset)
ds = GLUE(sub_dataset=sub_dataset, tokenizer=tokenizer, max_seq_len=10)
for e in ds.get_train_examples()[:2]:
print(e)
print()
......@@ -182,3 +185,6 @@ if __name__ == "__main__":
for e in ds.get_predict_examples()[:2]:
print(e)
print()
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -22,13 +22,13 @@ import os
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/iflytek.tar.gz"
class IFLYTEK(BaseNLPDataset):
def __init__(self):
class IFLYTEK(TextClassificationDataset):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "iflytek")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(IFLYTEK, self).__init__(
......@@ -38,7 +38,8 @@ class IFLYTEK(BaseNLPDataset):
test_file="test.txt",
label_file=None,
label_list=[str(i) for i in range(119)],
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -56,7 +57,9 @@ class IFLYTEK(BaseNLPDataset):
if __name__ == "__main__":
ds = IFLYTEK()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = IFLYTEK(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -67,3 +70,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -23,17 +23,17 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/inews.tar.gz"
class INews(BaseNLPDataset):
class INews(TextClassificationDataset):
"""
INews is a sentiment analysis dataset for Internet News
"""
def __init__(self):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "inews")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(INews, self).__init__(
......@@ -43,7 +43,8 @@ class INews(BaseNLPDataset):
test_file="test.txt",
label_file=None,
label_list=["0", "1", "2"],
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -60,7 +61,10 @@ class INews(BaseNLPDataset):
if __name__ == "__main__":
ds = INews()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = INews(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -71,3 +75,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -23,13 +23,13 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/lcqmc.tar.gz"
class LCQMC(BaseNLPDataset):
def __init__(self):
class LCQMC(TextClassificationDataset):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "lcqmc")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(LCQMC, self).__init__(
......@@ -39,7 +39,8 @@ class LCQMC(BaseNLPDataset):
test_file="test.tsv",
label_file=None,
label_list=["0", "1"],
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -58,7 +59,10 @@ class LCQMC(BaseNLPDataset):
if __name__ == "__main__":
ds = LCQMC()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = LCQMC(tokenizer=tokenizer, max_seq_len=512)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -69,3 +73,7 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -23,12 +23,12 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import SeqLabelingDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/msra_ner.tar.gz"
class MSRA_NER(BaseNLPDataset):
class MSRA_NER(SeqLabelingDataset):
"""
A set of manually annotated Chinese word-segmentation data and
specifications for training and testing a Chinese word-segmentation system
......@@ -36,7 +36,7 @@ class MSRA_NER(BaseNLPDataset):
https://www.microsoft.com/en-us/download/details.aspx?id=52531
"""
def __init__(self):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "msra_ner")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(MSRA_NER, self).__init__(
......@@ -48,7 +48,8 @@ class MSRA_NER(BaseNLPDataset):
label_list=[
"B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"
],
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -67,7 +68,9 @@ class MSRA_NER(BaseNLPDataset):
if __name__ == "__main__":
ds = MSRA_NER()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = MSRA_NER(tokenizer=tokenizer, max_seq_len=30)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -78,3 +81,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -23,19 +23,19 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/nlpcc-dbqa.tar.gz"
class NLPCC_DBQA(BaseNLPDataset):
class NLPCC_DBQA(TextClassificationDataset):
"""
Please refer to
http://tcci.ccf.org.cn/conference/2017/dldoc/taskgline05.pdf
for more information
"""
def __init__(self):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "nlpcc-dbqa")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(NLPCC_DBQA, self).__init__(
......@@ -45,7 +45,8 @@ class NLPCC_DBQA(BaseNLPDataset):
test_file="test.tsv",
label_file=None,
label_list=["0", "1"],
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -64,7 +65,9 @@ class NLPCC_DBQA(BaseNLPDataset):
if __name__ == "__main__":
ds = NLPCC_DBQA()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = NLPCC_DBQA(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -75,3 +78,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -20,7 +20,7 @@ import os
from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import MRCDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz"
......@@ -65,10 +65,17 @@ class SquadExample(object):
return s
class SQUAD(BaseNLPDataset):
class SQUAD(MRCDataset):
"""A single set of features of data."""
def __init__(self, version_2_with_negative=False):
def __init__(
self,
version_2_with_negative=False,
tokenizer=None,
max_seq_len=None,
max_query_len=64,
doc_stride=128,
):
self.version_2_with_negative = version_2_with_negative
if not version_2_with_negative:
train_file = "train-v1.1.json"
......@@ -87,6 +94,10 @@ class SQUAD(BaseNLPDataset):
test_file=None,
label_file=None,
label_list=None,
tokenizer=tokenizer,
max_seq_len=max_seq_len,
max_query_len=max_query_len,
doc_stride=doc_stride,
)
def _read_file(self, input_file, phase=None):
......@@ -177,7 +188,10 @@ class SQUAD(BaseNLPDataset):
if __name__ == "__main__":
ds = SQUAD(version_2_with_negative=True)
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = SQUAD(
version_2_with_negative=True, tokenizer=tokenizer, max_seq_len=512)
print("first 10 dev")
for e in ds.get_dev_examples()[:2]:
print(e)
......
......@@ -22,13 +22,13 @@ import os
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/thucnews.tar.gz"
class THUCNEWS(BaseNLPDataset):
def __init__(self):
class THUCNEWS(TextClassificationDataset):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "thucnews")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(THUCNEWS, self).__init__(
......@@ -38,7 +38,8 @@ class THUCNEWS(BaseNLPDataset):
test_file="test.txt",
label_file=None,
label_list=[str(i) for i in range(14)],
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -56,7 +57,9 @@ class THUCNEWS(BaseNLPDataset):
if __name__ == "__main__":
ds = THUCNEWS()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = THUCNEWS(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -67,3 +70,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -20,7 +20,8 @@ from __future__ import print_function
import io
import os
from paddlehub.dataset import InputExample, BaseDataset
from paddlehub.dataset import InputExample
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
from paddlehub.common.dir import DATA_HOME
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/tnews.tar.gz"
......@@ -44,12 +45,12 @@ LABEL_NAME = {
}
class TNews(BaseDataset):
class TNews(TextClassificationDataset):
"""
TNews is the Chinese news classification dataset from the Jinri Toutiao app.
"""
def __init__(self):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "tnews")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
label_list = [
......@@ -63,7 +64,8 @@ class TNews(BaseDataset):
test_file="toutiao_category_test.txt",
label_file=None,
label_list=label_list,
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def get_label_name(self, id):
return LABEL_NAME[id]
......@@ -82,7 +84,9 @@ class TNews(BaseDataset):
if __name__ == "__main__":
ds = TNews()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = TNews(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -93,3 +97,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -22,18 +22,18 @@ import pandas as pd
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import MultiLabelDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/toxic.tar.gz"
class Toxic(BaseNLPDataset):
class Toxic(MultiLabelDataset):
"""
The Kaggle Toxic dataset:
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge
"""
def __init__(self):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "toxic")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
label_list = [
......@@ -47,7 +47,8 @@ class Toxic(BaseNLPDataset):
test_file="test.csv",
label_file=None,
label_list=label_list,
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -64,7 +65,10 @@ class Toxic(BaseNLPDataset):
if __name__ == "__main__":
ds = Toxic()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = Toxic(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -75,3 +79,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -25,19 +25,19 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/XNLI-lan.tar.gz"
class XNLI(BaseNLPDataset):
class XNLI(TextClassificationDataset):
"""
Please refer to
https://arxiv.org/pdf/1809.05053.pdf
for more information
"""
def __init__(self, language='zh'):
def __init__(self, language='zh', tokenizer=None, max_seq_len=None):
if language not in [
"ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw",
"th", "tr", "ur", "vi", "zh"
......@@ -55,6 +55,8 @@ class XNLI(BaseNLPDataset):
test_file="%s_test.tsv" % language,
label_file=None,
label_list=["neutral", "contradiction", "entailment"],
tokenizer=tokenizer,
max_seq_len=max_seq_len,
)
def _read_file(self, input_file, phase=None):
......@@ -74,7 +76,10 @@ class XNLI(BaseNLPDataset):
if __name__ == "__main__":
ds = XNLI()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = XNLI(tokenizer=tokenizer, max_seq_len=20)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......
......@@ -167,7 +167,7 @@ class DefaultStrategy(object):
self.optimizer = fluid.optimizer.Adam(
learning_rate=self.learning_rate, **kwargs)
def execute(self, loss, data_reader, config, dev_count):
def execute(self, loss, max_train_steps):
if self.optimizer is not None:
self.optimizer.minimize(loss)
else:
......@@ -456,26 +456,9 @@ class CombinedStrategy(DefaultStrategy):
"weight_decay"] * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
def execute(self, loss, data_reader, config, dev_count):
def execute(self, loss, max_train_steps):
# base information
self.main_program = loss.block.program
self.config = config
# self.num_examples = {'train': -1, 'dev': -1, 'test': -1} before data_generator
data_reader.data_generator(
batch_size=config.batch_size, phase='train', shuffle=True)
num_train_examples = data_reader.num_examples['train']
max_train_steps = config.num_epoch * num_train_examples // config.batch_size // dev_count
try:
# nlp_reader
_in_tokens = data_reader.in_tokens
if _in_tokens:
max_train_steps *= data_reader.max_seq_len
except:
# cv_reader without .in_tokens and .max_seq_len
pass
if self.scheduler["discriminative"]["blocks"] > 0 or self.scheduler[
"gradual_unfreeze"]["blocks"] > 0:
......@@ -494,8 +477,7 @@ class CombinedStrategy(DefaultStrategy):
self.regularization_handler(loss, scheduled_lr)
logger.info(self.__str__())
return scheduled_lr, max_train_steps
return scheduled_lr
def exclude_from_weight_decay(self, name):
if name.find("layer_norm") > -1:
......
......@@ -35,6 +35,7 @@ import paddle.fluid as fluid
from visualdl import LogWriter
import paddlehub as hub
from paddlehub.reader.nlp_reader import BaseNLPReader
from paddlehub.common.paddle_helper import dtype_map, clone_program
from paddlehub.common.utils import mkdir
from paddlehub.common.dir import tmp_dir
......@@ -84,7 +85,7 @@ class RunEnv(object):
self.start_program = None
self.main_program_compiled = None
self.py_reader = None
self.reader = None
self.generator = None
self.loss = None
self.labels = None
self.metrics = None
......@@ -260,8 +261,8 @@ class BaseTask(object):
BaseTask is the base class of all tasks. It builds the complete running environment.
Args:
feed_list (list): the inputs name
data_reader (object): data reader for the task
feed_list (list): the input variable names. Deprecated in paddlehub v1.8.
data_reader (object): the data reader for the task. Deprecated in paddlehub v1.8.
main_program (object): the customized main_program, default None
startup_program (object): the customized startup_program, default None
config (object): the config for the task, default None
......@@ -269,16 +270,13 @@ class BaseTask(object):
"""
def __init__(self,
feed_list,
data_reader,
dataset=None,
feed_list=None,
data_reader=None,
main_program=None,
startup_program=None,
config=None,
metrics_choices="default"):
# base item
self._base_data_reader = data_reader
self._base_feed_list = feed_list
# metrics item
self.best_score = -999
if metrics_choices == "default":
......@@ -293,7 +291,6 @@ class BaseTask(object):
if main_program is None:
self._base_main_program = clone_program(
fluid.default_main_program(), for_test=False)
else:
self._base_main_program = clone_program(
main_program, for_test=False)
......@@ -344,6 +341,23 @@ class BaseTask(object):
# set default phase
self.enter_phase("train")
self.dataset = dataset
if dataset:
self._label_list = dataset.get_labels()
# Compatibility code for usage deprecated in paddlehub v1.8.
self._base_data_reader = data_reader
self._base_feed_list = feed_list
if isinstance(data_reader, BaseNLPReader):
self._compatible_mode = True
logger.warning(
"PaddleHub v1.8 has deprecated the reader and feed_list parameters in the nlp Task. We provided an easier usage, "
"in which you can use your tokenizer to preprocess dataset and run task in a clear flow. "
"New demo see https://github.com/PaddlePaddle/PaddleHub/blob/release/v1.8/demo/text_classification/text_cls.py"
)
else:
self._compatible_mode = False
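# In short: _compatible_mode is True only when a legacy BaseNLPReader is passed via
# data_reader; a task built with a dataset (whose records come from a tokenizer) runs
# the new flow and feeds data from dataset.get_records() instead of a reader.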
@contextlib.contextmanager
def phase_guard(self, phase):
self.enter_phase(phase)
......@@ -420,9 +434,29 @@ class BaseTask(object):
with fluid.program_guard(self.env.main_program,
self._base_startup_program):
with fluid.unique_name.guard(self.env.UNG):
self.scheduled_lr, self.max_train_steps = self.config.strategy.execute(
self.loss, self._base_data_reader, self.config,
self.device_count)
if self._compatible_mode:
# This branch is compatibility code for usage deprecated in paddlehub v1.8.
self._base_data_reader.data_generator(
batch_size=self.config.batch_size,
phase='train',
shuffle=True)
num_train_examples = self._base_data_reader.num_examples[
'train']
try:
# nlp_reader
_in_tokens = self._base_data_reader.in_tokens
if _in_tokens:
num_train_examples *= self._base_data_reader.max_seq_len
except:
# cv_reader without .in_tokens and .max_seq_len
pass
else:
num_train_examples = len(
self.dataset.get_train_records())
self.max_train_steps = self.config.num_epoch * num_train_examples // self.config.batch_size // self.device_count
self.scheduled_lr = self.config.strategy.execute(
self.loss, self.max_train_steps)
if self.is_train_phase:
loss_name = self.env.loss.name
......@@ -529,17 +563,40 @@ class BaseTask(object):
return self.main_program
@property
def reader(self):
def generator(self):
if self._compatible_mode:
if self.is_predict_phase:
data = self._predict_data
else:
data = None
self.env.reader = self._base_data_reader.data_generator(
self.env.generator = self._base_data_reader.data_generator(
batch_size=self.config.batch_size,
phase=self.phase,
data=data,
return_list=not self.config.use_pyreader)
return self.env.reader
else:
def data_generator(records):
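# Each record is a dict produced by the dataset's tokenizer; yield its values
# in feed_list order so they line up with the feed variables, e.g. a record
# {"input_ids": [...], "segment_ids": [...], "label": 3} becomes [[...], [...], 3].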
def wrapper():
for record in records:
values = []
for feed_name in self.feed_list:
values.append(record[feed_name])
yield values
return wrapper
if self.is_predict_phase:
records = self._predict_data
else:
if self.is_train_phase:
shuffle = True
else:
shuffle = False
records = self.dataset.get_records(
phase=self.phase, shuffle=shuffle)
self.env.generator = data_generator(records)
return self.env.generator
@property
def loss(self):
......@@ -580,13 +637,30 @@ class BaseTask(object):
@property
def feed_list(self):
if self._compatible_mode:
feed_list = [varname for varname in self._base_feed_list]
if self.is_train_phase or self.is_test_phase:
feed_list += [label.name for label in self.labels]
else:
if not self.env.is_inititalized:
self._build_env()
if self._predict_data:
feed_list = list(self._predict_data[0].keys())
else:
feed_list = self.dataset.get_feed_list(self.phase)
feed_list = [
feed_name for feed_name in feed_list
if feed_name in self.main_program.global_block().vars
]
return feed_list
@property
def feed_var_list(self):
if not self.env.is_inititalized:
self._build_env()
vars = self.main_program.global_block().vars
return [vars[varname] for varname in self.feed_list]
......@@ -890,13 +964,20 @@ class BaseTask(object):
self.env.current_epoch += 1
# Final evaluation
if self._base_data_reader.get_dev_examples() != []:
if self._compatible_mode:
dev_examples = self._base_data_reader.get_dev_examples()
test_examples = self._base_data_reader.get_test_examples()
else:
dev_examples = self.dataset.get_dev_examples()
test_examples = self.dataset.get_test_examples()
if dev_examples != []:
# Warning: DO NOT use self.eval(phase="dev", load_best_model=True) during training.
# It will make the trainer unable to continue training from the checkpoint after eval.
# More importantly, the model should evaluate its current performance during training.
self.eval(phase="dev")
if self._base_data_reader.get_test_examples() != []:
if test_examples != []:
self.eval(phase="test", load_best_model=True)
# Save checkpoint after finetune
self.save_checkpoint()
......@@ -957,17 +1038,41 @@ class BaseTask(object):
global_run_states = []
period_run_states = []
for run_step, batch in enumerate(self.reader(), start=1):
feed_var_shape = []
feed_var_type = []
for var in self.feed_var_list:
feed_var_shape.append(var.shape)
feed_var_type.append(dtype_map[var.dtype])
if self._compatible_mode:
data_reader = self.generator
else:
data_reader = paddle.batch(
self.generator, batch_size=self.config.batch_size)
for batch in data_reader():
if self._compatible_mode and not self.config.use_pyreader:
# if pyreader is not used, the nlp_reader returns [batch]
batch = batch[0]
step_run_state = RunState(len(self.fetch_list))
step_run_state.run_step = 1
num_batch_examples = len(batch)
if not self.config.use_pyreader:
# if use pyreader, the nlp_reader return [batch]
batch = batch[0]
batch = [fluid.core.PaddleTensor(data) for data in batch]
fetch_result = self._predictor.run(batch)
# Process the data into the shape and type suitable for the model
processed_batch = [[] for i in range(len(self.feed_list))]
if self._compatible_mode:
processed_batch = batch
else:
for sample in batch:
for i, data in enumerate(sample):
processed_batch[i].append(data)
tensor_batch = [[] for i in range(len(self.feed_list))]
for i in range(len(processed_batch)):
processed_batch[i] = np.array(processed_batch[i]).reshape(
feed_var_shape[i]).astype(feed_var_type[i])
tensor_batch[i] = fluid.core.PaddleTensor(processed_batch[i])
fetch_result = self._predictor.run(tensor_batch)
for index, result in enumerate(fetch_result):
step_run_state.run_results[index] = result.as_ndarray()
step_run_state.run_examples += num_batch_examples
......@@ -978,18 +1083,23 @@ class BaseTask(object):
global_run_states += period_run_states
return global_run_states
def predict(self,
data,
def predict(
self,
data=None,
label_list=None,
load_best_model=True,
return_result=False,
accelerate_mode=True):
accelerate_mode=True,
):
"""
make prediction for the input data.
Args:
data (list): the data will be predicted.
data (list): the data to be predicted. Each element should be a record when the task is initialized without the data_reader param,
or a plain text list when the task is initialized with the data_reader param (deprecated in paddlehub v1.8).
label_list (list): the label list, used to postprocess the output.
load_best_model (bool): load the best model or not
return_result (bool): return a readable result or just the raw run result
return_result (bool): return a readable result or just the raw run result. Treated as True when the task is not initialized with the data_reader param.
accelerate_mode (bool): use high-performance predictor or not
Returns:
......@@ -1005,6 +1115,7 @@ class BaseTask(object):
with self.phase_guard(phase="predict"):
self._predict_data = data
self._label_list = label_list
self._predict_start_event()
if load_best_model:
......@@ -1020,7 +1131,7 @@ class BaseTask(object):
self._predict_end_event(run_states)
self._predict_data = None
if return_result:
if return_result or not self._compatible_mode:
return self._postprocessing(run_states)
return run_states
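# Illustrative usage under the new flow (a sketch; the tokenizer/encode names below
# follow the v1.8 text classification demo and are assumptions, not part of this file):
#
#   records = [tokenizer.encode(text=t, max_seq_len=128) for t in texts]
#   results = task.predict(data=records, label_list=dataset.get_labels())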
......@@ -1057,20 +1168,34 @@ class BaseTask(object):
capacity=64,
use_double_buffer=True,
iterable=True)
if self._compatible_mode:
data_reader = data_loader.set_batch_generator(
self.reader, places=self.places)
self.generator, places=self.places)
else:
data_reader = data_loader.set_sample_generator(
self.generator,
places=self.places,
batch_size=self.config.batch_size,
drop_last=True)
else:
data_feeder = fluid.DataFeeder(
feed_list=self.feed_list, place=self.place)
if self._compatible_mode:
data_reader = data_feeder.decorate_reader(
self.generator,
multi_devices=self.config.use_data_parallel,
drop_last=True)
else:
data_reader = data_feeder.decorate_reader(
self.reader,
paddle.batch(
self.generator, batch_size=self.config.batch_size),
multi_devices=self.config.use_data_parallel,
drop_last=True)
global_run_states = []
period_run_states = []
for run_step, batch in enumerate(data_reader(), start=1):
for batch in data_reader():
step_run_state = RunState(len(self.fetch_list))
step_run_state.run_step = 1
num_batch_examples = len(batch)
......@@ -1107,6 +1232,5 @@ class BaseTask(object):
return global_run_states
def __repr__(self):
return "Task: %s with metrics_choices: %s, reader: %s, %s" % (
self.__class__.__name__, self.metrics_choices,
self._base_data_reader.__class__.__name__, self.config)
return "Task: %s with metrics_choices: %s, %s" % (
self.__class__.__name__, self.metrics_choices, self.config)
......@@ -19,13 +19,12 @@ from __future__ import print_function
from collections import OrderedDict
import numpy as np
import paddle
import paddle.fluid as fluid
import time
from paddlehub.common.logger import logger
from paddlehub.finetune.evaluate import calculate_f1_np, matthews_corrcoef
from paddlehub.reader.nlp_reader import ClassifyReader
from paddlehub.reader.nlp_reader import ClassifyReader, LACClassifyReader
import paddlehub.network as net
from .base_task import BaseTask
......@@ -35,8 +34,9 @@ class ClassifierTask(BaseTask):
def __init__(self,
feature,
num_classes,
feed_list,
data_reader,
dataset=None,
feed_list=None,
data_reader=None,
startup_program=None,
config=None,
hidden_units=None,
......@@ -46,6 +46,7 @@ class ClassifierTask(BaseTask):
main_program = feature.block.program
super(ClassifierTask, self).__init__(
dataset=dataset,
data_reader=data_reader,
main_program=main_program,
feed_list=feed_list,
......@@ -109,7 +110,7 @@ class ClassifierTask(BaseTask):
run_examples += run_state.run_examples
run_step += run_state.run_step
loss_sum += np.mean(
run_state.run_results[-2]) * run_state.run_examples
run_state.run_results[-1]) * run_state.run_examples
acc_sum += np.mean(
run_state.run_results[2]) * run_state.run_examples
np_labels = run_state.run_results[0]
......@@ -140,20 +141,28 @@ class ClassifierTask(BaseTask):
return scores, avg_loss, run_speed
def _postprocessing(self, run_states):
if self._compatible_mode:
try:
id2label = {
val: key
for key, val in self._base_data_reader.label_map.items()
}
label_list = list(self._base_data_reader.label_map.keys())
except:
raise Exception(
"ImageClassificationDataset does not support postprocessing, please use BaseCVDataset instead"
)
else:
if self._label_list:
label_list = self._label_list
else:
logger.warning(
"Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
)
return run_states
results = []
for batch_state in run_states:
batch_result = batch_state.run_results
batch_infer = np.argmax(batch_result[0], axis=1)
results += [id2label[sample_infer] for sample_infer in batch_infer]
results += [
label_list[sample_infer] for sample_infer in batch_infer
]
return results
......@@ -166,10 +175,12 @@ class TextClassifierTask(ClassifierTask):
It uses a fully-connected layer with a softmax activation function to classify texts.
"""
def __init__(self,
def __init__(
self,
num_classes,
feed_list,
data_reader,
dataset=None,
feed_list=None, # Deprecated
data_reader=None, # Deprecated
feature=None,
token_feature=None,
network=None,
......@@ -180,8 +191,8 @@ class TextClassifierTask(ClassifierTask):
"""
Args:
num_classes: total labels of the text classification task.
feed_list(list): the variable name that will be feeded to the main program
data_reader(object): data reader for the task. It must be one of ClassifyReader and LACClassifyReader.
feed_list(list): the names of the variables that will be fed to the main program. Deprecated in paddlehub v1.8.
data_reader(object): data reader for the task. It must be one of ClassifyReader and LACClassifyReader. Deprecated in paddlehub v1.8.
feature(Variable): the `feature` will be used to classify texts. It must be a sentence-level feature with shape [-1, emb_size]. `token_feature` and `feature` cannot be set at the same time; exactly one of them must be not None. Default None.
token_feature(Variable): the feature that will be fed to the pre-defined network. It must be a token-level feature with shape [-1, seq_len, emb_size]. Default None.
network(str): the pre-defined network. Choices: 'bilstm', 'bow', 'cnn', 'dpcnn', 'gru' and 'lstm'. Default None. If network is set, then `token_feature` must be set and `feature` must be None.
......@@ -193,12 +204,12 @@ class TextClassifierTask(ClassifierTask):
"""
if (not feature) and (not token_feature):
logger.error(
'Both token_feature and feature are None, one of them must be setted.'
'Both token_feature and feature are None, one of them must be set.'
)
exit(1)
elif feature and token_feature:
logger.error(
'Both token_feature and feature are setted. One should be setted, the other should be None.'
'Both token_feature and feature are set. One should be set, the other should be None.'
)
exit(1)
......@@ -226,6 +237,7 @@ class TextClassifierTask(ClassifierTask):
metrics_choices = ["acc"]
super(TextClassifierTask, self).__init__(
dataset=dataset,
data_reader=data_reader,
feature=feature if feature else token_feature,
num_classes=num_classes,
......@@ -236,16 +248,14 @@ class TextClassifierTask(ClassifierTask):
metrics_choices=metrics_choices)
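# Illustrative construction under the new flow (a sketch; variable names are assumptions):
# a dataset provides the labels and records, and a pre-defined network can be stacked on
# the token-level feature instead of passing a reader/feed_list.
#
#   task = TextClassifierTask(
#       dataset=dataset,
#       token_feature=outputs["sequence_output"],
#       network='bilstm',
#       num_classes=dataset.num_labels,
#       config=config)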
def _build_net(self):
if isinstance(self._base_data_reader, ClassifyReader):
# ClassifyReader will return the seqence length of an input text
if not isinstance(self._base_data_reader, LACClassifyReader):
# LACClassifyReader won't return the sequence length, while a Dataset with a tokenizer and ClassifyReader will.
self.seq_len = fluid.layers.data(
name="seq_len", shape=[1], dtype='int64', lod_level=0)
self.seq_len_used = fluid.layers.squeeze(self.seq_len, axes=[1])
# unpad the token_feature
unpad_feature = fluid.layers.sequence_unpad(
self.feature, length=self.seq_len_used)
if self.network:
# add pre-defined net
net_func = getattr(net.classification, self.network)
......@@ -253,10 +263,15 @@ class TextClassifierTask(ClassifierTask):
# the dpcnn network does not need unpadding
cls_feats = net_func(
self.feature, emb_dim=self.feature.shape[-1])
else:
if self._compatible_mode and isinstance(self._base_data_reader,
LACClassifyReader):
cls_feats = net_func(self.feature)
else:
cls_feats = net_func(unpad_feature)
logger.info(
"%s has been added in the TextClassifierTask!" % self.network)
if self.is_train_phase:
logger.info("%s has been added in the TextClassifierTask!" %
self.network)
else:
# no pre-defined network is used; fall back to a fully-connected net
cls_feats = fluid.layers.dropout(
......@@ -286,12 +301,15 @@ class TextClassifierTask(ClassifierTask):
@property
def feed_list(self):
if self._compatible_mode:
feed_list = [varname for varname in self._base_feed_list]
if isinstance(self._base_data_reader, ClassifyReader):
# ClassifyReader will return the sequence length of an input text
feed_list += [self.seq_len.name]
if self.is_train_phase or self.is_test_phase:
feed_list += [self.labels[0].name]
else:
feed_list = super(TextClassifierTask, self).feed_list
return feed_list
@property
......@@ -303,11 +321,10 @@ class TextClassifierTask(ClassifierTask):
]
else:
# predict phase
if isinstance(self._base_data_reader, LACClassifyReader):
fetch_list = [self.outputs[0].name]
if isinstance(self._base_data_reader, ClassifyReader):
# to avoid save_inference_model to prune seq_len variable
fetch_list += [self.seq_len.name]
else:
fetch_list = [self.outputs[0].name, self.seq_len.name]
return fetch_list
......@@ -316,8 +333,9 @@ class MultiLabelClassifierTask(ClassifierTask):
def __init__(self,
feature,
num_classes,
feed_list,
data_reader,
dataset=None,
feed_list=None,
data_reader=None,
startup_program=None,
config=None,
hidden_units=None,
......@@ -325,8 +343,8 @@ class MultiLabelClassifierTask(ClassifierTask):
if metrics_choices == "default":
metrics_choices = ["auc"]
main_program = feature.block.program
super(MultiLabelClassifierTask, self).__init__(
dataset=dataset,
data_reader=data_reader,
feature=feature,
num_classes=num_classes,
......@@ -335,7 +353,10 @@ class MultiLabelClassifierTask(ClassifierTask):
config=config,
hidden_units=hidden_units,
metrics_choices=metrics_choices)
if self._compatible_mode:
self.class_name = list(data_reader.label_map.keys())
else:
self.class_name = self._label_list
def _build_net(self):
cls_feats = fluid.layers.dropout(
......@@ -428,13 +449,22 @@ class MultiLabelClassifierTask(ClassifierTask):
def _postprocessing(self, run_states):
results = []
if self._compatible_mode:
label_list = list(self._base_data_reader.label_map.keys())
else:
if self._label_list:
label_list = self._label_list
else:
logger.warning(
"Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
)
return run_states
for batch_state in run_states:
batch_result = batch_state.run_results
for sample_id in range(len(batch_result[0])):
sample_result = []
for category_id in range(
self._base_data_reader.dataset.num_labels):
for category_id in range(len(label_list)):
sample_category_prob = batch_result[category_id][sample_id]
sample_category_value = np.argmax(sample_category_prob)
sample_result.append(
......
......@@ -18,23 +18,22 @@ from __future__ import division
from __future__ import print_function
import time
import os
import collections
import math
import six
import json
from collections import OrderedDict
import io
from tqdm import tqdm
import numpy as np
import paddle.fluid as fluid
from .base_task import BaseTask
from paddlehub.common.logger import logger
from paddlehub.reader import tokenization
from paddlehub.finetune.evaluator import squad1_evaluate
from paddlehub.finetune.evaluator import squad2_evaluate
from paddlehub.finetune.evaluator import cmrc2018_evaluate
from .base_task import BaseTask
def _get_best_indexes(logits, n_best_size):
......@@ -193,6 +192,8 @@ def get_predictions(all_examples, all_features, all_results, n_best_size,
all_nbest_json = collections.OrderedDict()
scores_diff_json = collections.OrderedDict()
logger.info("Post processing...")
with tqdm(total=len(all_examples)) as process_bar:
for (example_index, example) in enumerate(all_examples):
features = example_index_to_features[example_index]
......@@ -209,13 +210,14 @@ def get_predictions(all_examples, all_features, all_results, n_best_size,
% feature.unique_id)
continue
result = unique_id_to_result[feature.unique_id]
start_indexes = _get_best_indexes(result.start_logits, n_best_size)
start_indexes = _get_best_indexes(result.start_logits,
n_best_size)
end_indexes = _get_best_indexes(result.end_logits, n_best_size)
# if we could have irrelevant answers, get the min score of irrelevant
if version_2_with_negative:
feature_null_score = result.start_logits[0] + result.end_logits[
0]
feature_null_score = result.start_logits[
0] + result.end_logits[0]
if feature_null_score < score_null:
score_null = feature_null_score
min_null_feature_index = feature_index
......@@ -235,7 +237,8 @@ def get_predictions(all_examples, all_features, all_results, n_best_size,
continue
if end_index not in feature.token_to_orig_map:
continue
if not feature.token_is_max_context.get(start_index, False):
if not feature.token_is_max_context.get(
start_index, False):
continue
if end_index < start_index:
continue
......@@ -294,8 +297,8 @@ def get_predictions(all_examples, all_features, all_results, n_best_size,
else:
orig_text = "".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, do_lower_case,
is_english)
final_text = get_final_text(tok_text, orig_text,
do_lower_case, is_english)
if final_text in seen_predictions:
continue
......@@ -322,7 +325,8 @@ def get_predictions(all_examples, all_features, all_results, n_best_size,
# just create a nonce prediction in this case to avoid failure.
if not nbest:
nbest.append(
_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
_NbestPrediction(
text="empty", start_logit=0.0, end_logit=0.0))
assert len(nbest) >= 1
......@@ -361,15 +365,16 @@ def get_predictions(all_examples, all_features, all_results, n_best_size,
all_predictions[example.qas_id] = best_non_null_entry.text
all_nbest_json[example.qas_id] = nbest_json
process_bar.update(1)
return all_predictions, all_nbest_json, scores_diff_json
class ReadingComprehensionTask(BaseTask):
def __init__(self,
feature,
feed_list,
data_reader,
dataset=None,
feed_list=None,
data_reader=None,
startup_program=None,
config=None,
metrics_choices=None,
......@@ -379,7 +384,9 @@ class ReadingComprehensionTask(BaseTask):
max_answer_length=30):
main_program = feature.block.program
self.data_reader = data_reader
super(ReadingComprehensionTask, self).__init__(
dataset=dataset,
data_reader=data_reader,
main_program=main_program,
feed_list=feed_list,
......@@ -387,7 +394,6 @@ class ReadingComprehensionTask(BaseTask):
config=config,
metrics_choices=metrics_choices)
self.feature = feature
self.data_reader = data_reader
self.sub_task = sub_task.lower()
self.version_2_with_negative = (self.sub_task == "squad2.0")
if self.sub_task in ["squad2.0", "squad"]:
......@@ -407,10 +413,10 @@ class ReadingComprehensionTask(BaseTask):
"RawResult", ["unique_id", "start_logits", "end_logits"])
def _build_net(self):
self.unique_ids = fluid.layers.data(
name="unique_ids", shape=[-1, 1], lod_level=0, dtype="int64")
self.unique_id = fluid.layers.data(
name="unique_id", shape=[-1, 1], lod_level=0, dtype="int64")
# to avoid memory optimization
_ = fluid.layers.assign(self.unique_ids)
_ = fluid.layers.assign(self.unique_id)
logits = fluid.layers.fc(
input=self.feature,
size=2,
......@@ -432,24 +438,24 @@ class ReadingComprehensionTask(BaseTask):
return [start_logits, end_logits, num_seqs]
def _add_label(self):
start_positions = fluid.layers.data(
name="start_positions", shape=[-1, 1], lod_level=0, dtype="int64")
end_positions = fluid.layers.data(
name="end_positions", shape=[-1, 1], lod_level=0, dtype="int64")
return [start_positions, end_positions]
start_position = fluid.layers.data(
name="start_position", shape=[-1, 1], lod_level=0, dtype="int64")
end_position = fluid.layers.data(
name="end_position", shape=[-1, 1], lod_level=0, dtype="int64")
return [start_position, end_position]
def _add_loss(self):
start_positions = self.labels[0]
end_positions = self.labels[1]
start_position = self.labels[0]
end_position = self.labels[1]
start_logits = self.outputs[0]
end_logits = self.outputs[1]
start_loss = fluid.layers.softmax_with_cross_entropy(
logits=start_logits, label=start_positions)
logits=start_logits, label=start_position)
start_loss = fluid.layers.mean(x=start_loss)
end_loss = fluid.layers.softmax_with_cross_entropy(
logits=end_logits, label=end_positions)
logits=end_logits, label=end_position)
end_loss = fluid.layers.mean(x=end_loss)
total_loss = (start_loss + end_loss) / 2.0
return total_loss
......@@ -459,22 +465,25 @@ class ReadingComprehensionTask(BaseTask):
@property
def feed_list(self):
if self._compatible_mode:
feed_list = [varname for varname in self._base_feed_list
] + [self.unique_ids.name]
] + [self.unique_id.name]
if self.is_train_phase or self.is_test_phase:
feed_list += [label.name for label in self.labels]
else:
feed_list = super(ReadingComprehensionTask, self).feed_list
return feed_list
@property
def fetch_list(self):
if self.is_train_phase or self.is_test_phase:
return [
self.loss.name, self.outputs[-1].name, self.unique_ids.name,
self.loss.name, self.outputs[-1].name, self.unique_id.name,
self.outputs[0].name, self.outputs[1].name
]
elif self.is_predict_phase:
return [
self.unique_ids.name,
self.unique_id.name,
] + [output.name for output in self.outputs]
def _calculate_metrics(self, run_states):
......@@ -503,11 +512,17 @@ class ReadingComprehensionTask(BaseTask):
run_time_used = time.time() - run_states[0].run_time_begin
run_speed = run_step / run_time_used
avg_loss = np.sum(total_cost) / np.sum(total_num_seqs)
scores = OrderedDict()
scores = collections.OrderedDict()
# If none of the metrics has been implemented, loss will be used for evaluation.
if self.is_test_phase:
if self._compatible_mode:
all_examples = self.data_reader.all_examples[self.phase]
all_features = self.data_reader.all_features[self.phase]
dataset = self.data_reader.dataset
else:
all_examples = self.dataset.get_examples(self.phase)
all_features = self.dataset.get_features(self.phase)
dataset = self.dataset
all_predictions, all_nbest_json, scores_diff_json = get_predictions(
all_examples=all_examples,
all_features=all_features,
......@@ -519,28 +534,23 @@ class ReadingComprehensionTask(BaseTask):
null_score_diff_threshold=self.null_score_diff_threshold,
is_english=self.is_english)
if self.phase == 'val' or self.phase == 'dev':
with io.open(
self.data_reader.dataset.dev_path, 'r',
encoding="utf8") as dataset_file:
dataset_json = json.load(dataset_file)
dataset = dataset_json['data']
dataset_path = dataset.dev_path
elif self.phase == 'test':
with io.open(
self.data_reader.dataset.test_path, 'r',
encoding="utf8") as dataset_file:
dataset_json = json.load(dataset_file)
dataset = dataset_json['data']
dataset_path = dataset.test_path
else:
raise Exception("Error phase: %s when runing _calculate_metrics"
% self.phase)
with io.open(dataset_path, 'r', encoding="utf8") as dataset_file:
dataset_json = json.load(dataset_file)
data = dataset_json['data']
if self.sub_task == "squad":
scores = squad1_evaluate.evaluate(dataset, all_predictions)
scores = squad1_evaluate.evaluate(data, all_predictions)
elif self.sub_task == "squad2.0":
scores = squad2_evaluate.evaluate(dataset, all_predictions,
scores = squad2_evaluate.evaluate(data, all_predictions,
scores_diff_json)
elif self.sub_task in ["cmrc2018", "drcd"]:
scores = cmrc2018_evaluate.get_eval(dataset, all_predictions)
scores = cmrc2018_evaluate.get_eval(data, all_predictions)
return scores, avg_loss, run_speed
def _postprocessing(self, run_states):
......@@ -558,8 +568,12 @@ class ReadingComprehensionTask(BaseTask):
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
if self._compatible_mode:
all_examples = self.data_reader.all_examples[self.phase]
all_features = self.data_reader.all_features[self.phase]
else:
all_examples = self.dataset.get_examples(self.phase)
all_features = self.dataset.get_features(self.phase)
all_predictions, all_nbest_json, scores_diff_json = get_predictions(
all_examples=all_examples,
all_features=all_features,
......
......@@ -29,8 +29,9 @@ from .base_task import BaseTask
class RegressionTask(BaseTask):
def __init__(self,
feature,
feed_list,
data_reader,
dataset=None,
feed_list=None,
data_reader=None,
startup_program=None,
config=None,
hidden_units=None,
......@@ -40,6 +41,7 @@ class RegressionTask(BaseTask):
main_program = feature.block.program
super(RegressionTask, self).__init__(
dataset=dataset,
data_reader=data_reader,
main_program=main_program,
feed_list=feed_list,
......
......@@ -21,10 +21,9 @@ import time
from collections import OrderedDict
import numpy as np
import paddle
import paddle.fluid as fluid
from paddlehub.finetune.evaluate import chunk_eval, calculate_f1
from paddlehub.common.utils import version_compare
from paddlehub.common.logger import logger
from .base_task import BaseTask
......@@ -33,8 +32,9 @@ class SequenceLabelTask(BaseTask):
feature,
max_seq_len,
num_classes,
feed_list,
data_reader,
dataset=None,
feed_list=None,
data_reader=None,
startup_program=None,
config=None,
metrics_choices="default",
......@@ -46,6 +46,7 @@ class SequenceLabelTask(BaseTask):
main_program = feature.block.program
super(SequenceLabelTask, self).__init__(
dataset=dataset,
data_reader=data_reader,
main_program=main_program,
feed_list=feed_list,
......@@ -199,11 +200,14 @@ class SequenceLabelTask(BaseTask):
@property
def feed_list(self):
if self._compatible_mode:
feed_list = [varname for varname in self._base_feed_list]
if self.is_train_phase or self.is_test_phase:
feed_list += [self.labels[0].name, self.seq_len.name]
else:
feed_list += [self.seq_len.name]
else:
feed_list = super(SequenceLabelTask, self).feed_list
return feed_list
@property
......@@ -215,10 +219,22 @@ class SequenceLabelTask(BaseTask):
return [output.name for output in self.outputs]
def _postprocessing(self, run_states):
if self._compatible_mode:
id2label = {
val: key
for key, val in self._base_data_reader.label_map.items()
}
else:
if self._label_list:
id2label = {}
for index, label in enumerate(self._label_list):
id2label[index] = label
else:
logger.warning(
"Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
)
return run_states
results = []
for batch_states in run_states:
batch_results = batch_states.run_results
......
......@@ -688,11 +688,13 @@ class Features(object):
s = ""
s += "unique_id: %s " % self.unique_id
s += "example_index: %s " % self.example_index
s += "doc_span_index: %s" % self.doc_span_index
s += "tokens: %s" % self.tokens
s += "token_to_orig_map %s" % self.token_to_orig_map
s += "token_is_max_context %s" % self.token_is_max_context
s += "start_position: %s " % self.start_position
s += "end_position: %s " % self.end_position
s += "is_impossible: %s " % self.is_impossible
# s += "tokens: %s" % self.tokens
# s += "token_to_orig_map %s" % self.token_to_orig_map
return s
......
......@@ -140,29 +140,6 @@ class FullTokenizer(object):
return convert_by_vocab(self.inv_vocab, ids)
class CharTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in text.lower().split(" "):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class WSSPTokenizer(object):
def __init__(self, vocab_file, sp_model_dir, word_dict, ws=True,
lower=True):
......
from .bert_tokenizer import BertTokenizer
from .bert_tokenizer import ErnieTinyTokenizer
This diff has been collapsed.
from collections import OrderedDict
import unicodedata
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = {}
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n").split("\t")[0]
vocab[token] = index
return vocab
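# Sketch of the vocab format load_vocab expects: one token per line (an optional
# tab-separated field after the token is ignored), and the line index becomes the id.
# For example, a file containing "[PAD]", "[UNK]", "hello" on separate lines yields
# {"[PAD]": 0, "[UNK]": 1, "hello": 2}.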
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
def is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (
cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
def is_chinese_char(char):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
cp = ord(char)
if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF) #
or (cp >= 0x20000 and cp <= 0x2A6DF) #
or (cp >= 0x2A700 and cp <= 0x2B73F) #
or (cp >= 0x2B740 and cp <= 0x2B81F) #
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
): #
return True
return False
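# Quick illustration of the helpers above (all derivable from the rules they encode):
#   is_whitespace(" ") -> True, is_control("\x07") -> True,
#   is_punctuation(",") -> True, is_chinese_char("中") -> True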
......@@ -8,6 +8,7 @@ visualdl >= 2.0.0b
cma >= 2.7.0
sentencepiece
colorlog
tqdm
# pandas no longer support python2 in version 0.25 and above
pandas ; python_version >= "3"
......