Unverified commit a1cf32cd authored by K kinghuin, committed by GitHub

Tokenizer refactor (#677)

Parent a253ecaa
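In short, this refactor drops the hub.reader.* / feed_list pipeline from the demos and datasets: a tokenizer (hub.BertTokenizer or hub.ErnieTinyTokenizer) is passed directly to the dataset, the dataset is passed to the task, and prediction consumes records produced by tokenizer.encode. A minimal sketch of the new flow, pieced together from the demos changed below (the module, dataset and config values here are illustrative, not part of this commit):

import paddlehub as hub

module = hub.Module(name="ernie_tiny")
inputs, outputs, program = module.context(trainable=True, max_seq_len=128)

# ernie_tiny ships a sentencepiece model and a word dict, so it gets its own
# tokenizer; other transformer modules use the plain BertTokenizer.
if module.name == "ernie_tiny":
    tokenizer = hub.ErnieTinyTokenizer(
        vocab_file=module.get_vocab_path(),
        spm_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())
else:
    tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())

# The dataset now tokenizes itself; no reader or feed_list is required.
dataset = hub.dataset.ChnSentiCorp(tokenizer=tokenizer, max_seq_len=128)

cls_task = hub.TextClassifierTask(
    dataset=dataset,
    feature=outputs["pooled_output"],
    num_classes=dataset.num_labels,
    config=hub.RunConfig(num_epoch=1, use_cuda=False))
cls_task.finetune_and_eval()

# Prediction takes pre-encoded records plus the dataset's label list.
encoded = [tokenizer.encode(text=t, max_seq_len=128) for t in ["交通方便;环境很好"]]
print(cls_task.predict(data=encoded, label_list=dataset.get_labels()))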
......@@ -39,18 +39,17 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use MultiLabelReader to read dataset
dataset = hub.dataset.Toxic()
reader = hub.reader.MultiLabelClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
# Setup feed list for data feeder
feed_list = [
inputs["input_ids"].name, inputs["position_ids"].name,
inputs["segment_ids"].name, inputs["input_mask"].name
]
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it will do word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.Toxic(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
......@@ -72,9 +71,8 @@ if __name__ == '__main__':
# Define a classification fine-tune task by PaddleHub's API
multi_label_cls_task = hub.MultiLabelClassifierTask(
data_reader=reader,
dataset=dataset,
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
config=config)
......
......@@ -45,20 +45,11 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use MultiLabelReader to read dataset
# Download the dataset and get its label list and number of labels
# If you only need the label information, you can omit the tokenizer parameter to avoid preprocessing the train set.
dataset = hub.dataset.Toxic()
reader = hub.reader.MultiLabelClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
# Setup feed list for data feeder
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
num_classes = dataset.num_labels
label_list = dataset.get_labels()
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
......@@ -75,20 +66,29 @@ if __name__ == '__main__':
# Define a classification fine-tune task by PaddleHub's API
multi_label_cls_task = hub.MultiLabelClassifierTask(
data_reader=reader,
dataset=dataset,
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
config=config)
# Data to be predicted
data = [
[
"Yes you did. And you admitted to doing it. See the Warren Kinsella talk page."
],
[
"I asked you a question. We both know you have my page on your watch list, so are why are you playing games and making me formally ping you? Makin'Bacon"
],
"Yes you did. And you admitted to doing it. See the Warren Kinsella talk page.",
"I asked you a question. We both know you have my page on your watch list, so are why are you playing games and making me formally ping you? Makin'Bacon",
]
# Use the appropriate tokenizer to preprocess the data
# For ernie_tiny, it will do word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
print(multi_label_cls_task.predict(data=data, return_result=True))
encoded_data = [
tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
for text in data
]
print(
multi_label_cls_task.predict(data=encoded_data, label_list=label_list))
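As a quick sanity check on the new input format, one can inspect a single encoded record before calling predict. A sketch; the field names below are the ones used by batch_records_generator later in this diff, so treat them as illustrative:

sample_record = tokenizer.encode(text=data[0], max_seq_len=args.max_seq_len)
# Expect fields such as input_ids, position_ids, segment_ids and input_mask.
print(sorted(sample_record.keys()))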
......@@ -36,31 +36,28 @@ args = parser.parse_args()
if __name__ == '__main__':
# Load Paddlehub ERNIE pretrained model
module = hub.Module(name="ernie")
module = hub.Module(name="ernie_tiny")
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use ClassifyReader to read dataset
dataset = hub.dataset.NLPCC_DBQA()
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it will do word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.NLPCC_DBQA(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensors the ERNIE module needs
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Select fine-tune strategy, setup config and fine-tune
strategy = hub.AdamWeightDecayStrategy(
warmup_proportion=args.warmup_proportion,
......@@ -78,9 +75,8 @@ if __name__ == '__main__':
# Define a classification fine-tune task by PaddleHub's API
cls_task = hub.TextClassifierTask(
data_reader=reader,
dataset=dataset,
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
config=config)
......
......@@ -39,30 +39,20 @@ args = parser.parse_args()
if __name__ == '__main__':
# loading Paddlehub ERNIE pretrained model
module = hub.Module(name="ernie")
module = hub.Module(name="ernie_tiny")
inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
# Sentence classification dataset reader
# Download the dataset and get its label list and number of labels
# If you only need the label information, you can omit the tokenizer parameter to avoid preprocessing the train set.
dataset = hub.dataset.NLPCC_DBQA()
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
num_classes = dataset.num_labels
label_list = dataset.get_labels()
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensors the ERNIE module needs
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
use_data_parallel=False,
......@@ -73,9 +63,8 @@ if __name__ == '__main__':
# Define a classification fine-tune task by PaddleHub's API
cls_task = hub.TextClassifierTask(
data_reader=reader,
dataset=dataset,
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
config=config)
......@@ -83,5 +72,18 @@ if __name__ == '__main__':
data = [["北京奥运博物馆的场景效果负责人是谁?", "主要承担奥运文物征集、保管、研究和爱国主义教育基地建设相关工作。"],
["北京奥运博物馆的场景效果负责人是谁", "于海勃,美国加利福尼亚大学教授 场景效果负责人 总设计师"],
["北京奥运博物馆的场景效果负责人是谁?", "洪麦恩,清华大学美术学院教授 内容及主展线负责人 总设计师"]]
print(cls_task.predict(data=data, return_result=True))
# Use the appropriate tokenizer to preprocess the data
# For ernie_tiny, it will do word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
encoded_data = [
tokenizer.encode(
text=text, text_pair=text_pair, max_seq_len=args.max_seq_len)
for text, text_pair in data
]
print(cls_task.predict(data=encoded_data, label_list=label_list))
......@@ -17,7 +17,6 @@
import argparse
import ast
import paddle.fluid as fluid
import paddlehub as hub
hub.common.logger.logger.setLevel("INFO")
......@@ -42,28 +41,23 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use ReadingComprehensionReader to read dataset
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it will do word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
# If you want to load the SQuAD 2.0 dataset, just set version_2_with_negative to True
dataset = hub.dataset.SQUAD(version_2_with_negative=False)
dataset = hub.dataset.SQUAD(
version_2_with_negative=False,
tokenizer=tokenizer,
max_seq_len=args.max_seq_len)
# dataset = hub.dataset.SQUAD(version_2_with_negative=True)
reader = hub.reader.ReadingComprehensionReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
doc_stride=128,
max_query_length=64)
seq_output = outputs["sequence_output"]
# Setup feed list for data feeder
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Select fine-tune strategy, setup config and fine-tune
strategy = hub.AdamWeightDecayStrategy(
weight_decay=args.weight_decay,
......@@ -72,7 +66,7 @@ if __name__ == '__main__':
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
eval_interval=300,
eval_interval=100,
use_data_parallel=args.use_data_parallel,
use_cuda=args.use_gpu,
num_epoch=args.num_epoch,
......@@ -82,9 +76,8 @@ if __name__ == '__main__':
# Define a reading comprehension fine-tune task by PaddleHub's API
reading_comprehension_task = hub.ReadingComprehensionTask(
data_reader=reader,
feature=seq_output,
feed_list=feed_list,
dataset=dataset,
feature=outputs["sequence_output"],
config=config,
sub_task="squad",
)
......
......@@ -20,12 +20,6 @@ from __future__ import print_function
import argparse
import ast
import numpy as np
import os
import time
import paddle
import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
......@@ -43,27 +37,11 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use RegressionReader to read dataset
dataset = hub.dataset.GLUE("STS-B")
reader = hub.reader.RegressionReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensors the ERNIE module needs
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
use_data_parallel=False,
......@@ -74,13 +52,22 @@ if __name__ == '__main__':
# Define a regression fine-tune task by PaddleHub's API
reg_task = hub.RegressionTask(
data_reader=reader,
feature=pooled_output,
feed_list=feed_list,
config=config,
)
# Data to be predicted
data = [[d.text_a, d.text_b] for d in dataset.get_predict_examples()[:10]]
print(reg_task.predict(data=data, return_result=True))
# STS-B provides the predict data and the dataset has already processed it. If you want to process customized data,
# see predict.py in the text_classification demo
# Use the appropriate tokenizer to preprocess the data
# For ernie_tiny, it will do word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.GLUE(
"STS-B", tokenizer=tokenizer, max_seq_len=args.max_seq_len)
encoded_data = dataset.get_predict_records()[:10]
print(reg_task.predict(data=encoded_data))
......@@ -17,7 +17,6 @@
import argparse
import ast
import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
......@@ -41,27 +40,24 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use RegressionReader to read dataset
dataset = hub.dataset.GLUE("STS-B")
reader = hub.reader.RegressionReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len)
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it will do word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.GLUE(
"STS-B", tokenizer=tokenizer, max_seq_len=args.max_seq_len)
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensors the ERNIE module needs
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Select fine-tune strategy, setup config and fine-tune
strategy = hub.AdamWeightDecayStrategy(
warmup_proportion=args.warmup_proportion,
......@@ -70,7 +66,6 @@ if __name__ == '__main__':
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
eval_interval=300,
use_data_parallel=args.use_data_parallel,
use_cuda=args.use_gpu,
num_epoch=args.num_epoch,
......@@ -80,10 +75,7 @@ if __name__ == '__main__':
# Define a regression fine-tune task by PaddleHub's API
reg_task = hub.RegressionTask(
data_reader=reader,
feature=pooled_output,
feed_list=feed_list,
config=config)
dataset=dataset, feature=pooled_output, config=config)
# Fine-tune and evaluate by PaddleHub's API
# will finish training, evaluation, testing, save model automatically
......
......@@ -42,30 +42,16 @@ if __name__ == '__main__':
module = hub.Module(name="ernie_tiny")
inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
# Sentence labeling dataset reader
# Download the dataset and get its label list and number of labels
# If you only need the label information, you can omit the tokenizer parameter to avoid preprocessing the train set.
dataset = hub.dataset.MSRA_NER()
reader = hub.reader.SequenceLabelReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
inv_label_map = {val: key for key, val in reader.label_map.items()}
num_classes = dataset.num_labels
label_list = dataset.get_labels()
# Construct transfer learning network
# Use "sequence_output" for token-level output.
sequence_output = outputs["sequence_output"]
# Setup feed list for data feeder
# Must feed all the tensors the ERNIE module needs
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
use_data_parallel=False,
......@@ -77,33 +63,31 @@ if __name__ == '__main__':
# Define a sequence labeling fine-tune task by PaddleHub's API
# If add_crf is True, the network uses a CRF decoder
seq_label_task = hub.SequenceLabelTask(
data_reader=reader,
feature=sequence_output,
feed_list=feed_list,
max_seq_len=args.max_seq_len,
num_classes=dataset.num_labels,
num_classes=num_classes,
config=config,
add_crf=False)
# Data to be predicted
# If using Python 2, the prefix "u" is necessary
data = [
[u"我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。"],
[u"为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。"],
[u"其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。"],
[u"有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。"],
[u"不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。"],
text_a = [
"我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。",
"为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。",
"其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。",
"有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。",
"不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。",
]
# Add 0x02 between characters to match the format of training data,
# otherwise the length of prediction results will not match the input string
# if the input string contains non-Chinese characters.
tmp_data = []
for example in data:
formatted = []
for sentence in example:
formatted.append('\x02'.join(list(sentence)))
tmp_data.append(formatted)
data = tmp_data
formatted_text_a = list(map("\002".join, text_a))
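# Illustration of the 0x02 framing on a hypothetical mixed string:
# '\002'.join(list("ok了")) == 'o\x02k\x02了'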
print(seq_label_task.predict(data=data, return_result=True))
# Use the appropriate tokenizer to preprocess the data
# For ernie_tiny, it uses BertTokenizer too.
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
encoded_data = [
tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
for text in formatted_text_a
]
print(seq_label_task.predict(data=encoded_data, label_list=label_list))
......@@ -40,26 +40,16 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download dataset and use SequenceLabelReader to read dataset
dataset = hub.dataset.MSRA_NER()
reader = hub.reader.SequenceLabelReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it uses BertTokenizer too.
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.MSRA_NER(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
# Construct transfer learning network
# Use "sequence_output" for token-level output.
sequence_output = outputs["sequence_output"]
# Setup feed list for data feeder
# Must feed all the tensors the module needs
feed_list = [
inputs["input_ids"].name, inputs["position_ids"].name,
inputs["segment_ids"].name, inputs["input_mask"].name
]
# Select a fine-tune strategy
strategy = hub.AdamWeightDecayStrategy(
warmup_proportion=args.warmup_proportion,
......@@ -78,9 +68,8 @@ if __name__ == '__main__':
# Define a sequence labeling fine-tune task by PaddleHub's API
# If add_crf is True, the network uses a CRF decoder
seq_label_task = hub.SequenceLabelTask(
data_reader=reader,
dataset=dataset,
feature=sequence_output,
feed_list=feed_list,
max_seq_len=args.max_seq_len,
num_classes=dataset.num_labels,
config=config,
......
......@@ -21,9 +21,9 @@ parser.add_argument("--max_seq_len", type=int, default=512,
# yapf: enable.
class TransformerSequenceLabelLayer(fluid.dygraph.Layer):
class TransformerSeqLabeling(fluid.dygraph.Layer):
def __init__(self, num_classes, transformer):
super(TransformerSequenceLabelLayer, self).__init__()
super(TransformerSeqLabeling, self).__init__()
self.num_classes = num_classes
self.transformer = transformer
self.fc = Linear(input_dim=768, output_dim=num_classes)
......@@ -39,11 +39,15 @@ class TransformerSequenceLabelLayer(fluid.dygraph.Layer):
def finetune(args):
ernie = hub.Module(name="ernie", max_seq_len=args.max_seq_len)
module = hub.Module(name="ernie", max_seq_len=args.max_seq_len)
# Use the appropriate tokenizer to preprocess the data set
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.MSRA_NER(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
with fluid.dygraph.guard():
dataset = hub.dataset.MSRA_NER()
ts = TransformerSequenceLabelLayer(
num_classes=dataset.num_labels, transformer=ernie)
ts = TransformerSeqLabeling(
num_classes=dataset.num_labels, transformer=module)
adam = AdamOptimizer(learning_rate=1e-5, parameter_list=ts.parameters())
state_dict_path = os.path.join(args.checkpoint_dir,
'dygraph_state_dict')
......@@ -51,34 +55,32 @@ def finetune(args):
state_dict, _ = fluid.load_dygraph(state_dict_path)
ts.load_dict(state_dict)
reader = hub.reader.SequenceLabelReader(
dataset=dataset,
vocab_path=ernie.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=ernie.get_spm_path(),
word_dict_path=ernie.get_word_dict_path())
train_reader = reader.data_generator(
batch_size=args.batch_size, phase='train')
loss_sum = total_infer = total_label = total_correct = cnt = 0
# Run num_epoch training epochs
for epoch in range(args.num_epoch):
# Read the training data and train on it
for batch_id, data in enumerate(train_reader()):
input_ids = np.array(data[0][0]).astype(np.int64)
position_ids = np.array(data[0][1]).astype(np.int64)
segment_ids = np.array(data[0][2]).astype(np.int64)
input_mask = np.array(data[0][3]).astype(np.float32)
labels = np.array(data[0][4]).astype(np.int64).reshape(-1, 1)
seq_len = np.squeeze(
np.array(data[0][5]).astype(np.int64), axis=1)
for batch_id, data in enumerate(
dataset.batch_records_generator(
phase="train",
batch_size=args.batch_size,
shuffle=True,
pad_to_batch_max_seq_len=False)):
batch_size = len(data["input_ids"])
input_ids = np.array(data["input_ids"]).astype(
np.int64).reshape([batch_size, -1, 1])
position_ids = np.array(data["position_ids"]).astype(
np.int64).reshape([batch_size, -1, 1])
segment_ids = np.array(data["segment_ids"]).astype(
np.int64).reshape([batch_size, -1, 1])
input_mask = np.array(data["input_mask"]).astype(
np.float32).reshape([batch_size, -1, 1])
labels = np.array(data["label"]).astype(np.int64).reshape(-1, 1)
seq_len = np.array(data["seq_len"]).astype(np.int64).reshape(
-1, 1)
pred, ret_infers = ts(input_ids, position_ids, segment_ids,
input_mask)
loss = fluid.layers.cross_entropy(pred, to_variable(labels))
avg_loss = fluid.layers.mean(loss)
avg_loss.backward()
# Update the parameters
adam.minimize(avg_loss)
loss_sum += avg_loss.numpy() * labels.shape[0]
......
......@@ -20,11 +20,7 @@ from __future__ import print_function
import argparse
import ast
import numpy as np
import os
import time
import paddle
import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
......@@ -43,32 +39,11 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download the dataset and use accuracy as the metric
# Choose dataset: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC
# Download the dataset and get its label list and number of labels
# If you only need the label information, you can omit the tokenizer parameter to avoid preprocessing the train set.
dataset = hub.dataset.ChnSentiCorp()
# For ernie_tiny, it uses sub-words to tokenize Chinese sentences
# If not ernie_tiny, sp_model_path and word_dict_path should be set to None
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensors the module needs
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
num_classes = dataset.num_labels
label_list = dataset.get_labels()
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
......@@ -80,14 +55,26 @@ if __name__ == '__main__':
# Define a classification fine-tune task by PaddleHub's API
cls_task = hub.TextClassifierTask(
data_reader=reader,
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
feature=outputs["pooled_output"],
num_classes=num_classes,
config=config)
# Data to be predicted
data = [["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"], ["交通方便;环境很好;服务态度很好 房间较小"],
["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"]]
print(cls_task.predict(data=data, return_result=True))
text_a = [
"这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般", "交通方便;环境很好;服务态度很好 房间较小",
"19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"
]
# Use the appropriate tokenizer to preprocess the data
# For ernie_tiny, it will do word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
encoded_data = [
tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
for text in text_a
]
print(cls_task.predict(data=encoded_data, label_list=label_list))
......@@ -20,11 +20,7 @@ from __future__ import print_function
import argparse
import ast
import numpy as np
import os
import time
import paddle
import paddle.fluid as fluid
import paddlehub as hub
# yapf: disable
......@@ -44,33 +40,17 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download the dataset and use accuracy as the metric
# Choose dataset: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC
# Download the dataset and get its label list and number of labels
# If you only need the label information, you can omit the tokenizer parameter to avoid preprocessing the train set.
dataset = hub.dataset.ChnSentiCorp()
# For ernie_tiny, it uses sub-words to tokenize Chinese sentences
# If not ernie_tiny, sp_model_path and word_dict_path should be set to None
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
num_classes = dataset.num_labels
label_list = dataset.get_labels()
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
token_feature = outputs["sequence_output"]
# Setup feed list for data feeder
# Must feed all the tensors the module needs
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Setup RunConfig for PaddleHub Fine-tune API
config = hub.RunConfig(
use_data_parallel=args.use_data_parallel,
......@@ -85,15 +65,27 @@ if __name__ == '__main__':
# you must use the outputs["sequence_output"] as the token_feature of TextClassifierTask,
# rather than outputs["pooled_output"], and feature is None
cls_task = hub.TextClassifierTask(
data_reader=reader,
token_feature=token_feature,
feed_list=feed_list,
network=args.network,
num_classes=dataset.num_labels,
config=config)
# Data to be predicted
data = [["这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"], ["交通方便;环境很好;服务态度很好 房间较小"],
["19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"]]
print(cls_task.predict(data=data, return_result=True))
text_a = [
"这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般", "交通方便;环境很好;服务态度很好 房间较小",
"19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"
]
# Use the appropriate tokenizer to preprocess the data
# For ernie_tiny, it will do word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
encoded_data = [
tokenizer.encode(text=text, max_seq_len=args.max_seq_len)
for text in text_a
]
print(cls_task.predict(data=encoded_data, label_list=label_list))
......@@ -7,5 +7,5 @@ python -u predict_predefine_net.py \
--checkpoint_dir=$CKPT_DIR \
--max_seq_len=128 \
--use_gpu=True \
--batch_size=24 \
--batch_size=1 \
--network=bilstm
......@@ -40,11 +40,23 @@ class TransformerClassifier(fluid.dygraph.Layer):
def finetune(args):
ernie = hub.Module(name="ernie", max_seq_len=args.max_seq_len)
module = hub.Module(name="ernie", max_seq_len=args.max_seq_len)
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it will do word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path(),
)
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
dataset = hub.dataset.ChnSentiCorp(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
with fluid.dygraph.guard():
dataset = hub.dataset.ChnSentiCorp()
tc = TransformerClassifier(
num_classes=dataset.num_labels, transformer=ernie)
num_classes=dataset.num_labels, transformer=module)
adam = AdamOptimizer(learning_rate=1e-5, parameter_list=tc.parameters())
state_dict_path = os.path.join(args.checkpoint_dir,
'dygraph_state_dict')
......@@ -52,32 +64,31 @@ def finetune(args):
state_dict, _ = fluid.load_dygraph(state_dict_path)
tc.load_dict(state_dict)
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=ernie.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=ernie.get_spm_path(),
word_dict_path=ernie.get_word_dict_path())
train_reader = reader.data_generator(
batch_size=args.batch_size, phase='train')
loss_sum = acc_sum = cnt = 0
# Run num_epoch training epochs
for epoch in range(args.num_epoch):
# Read the training data and train on it
for batch_id, data in enumerate(train_reader()):
input_ids = np.array(data[0][0]).astype(np.int64)
position_ids = np.array(data[0][1]).astype(np.int64)
segment_ids = np.array(data[0][2]).astype(np.int64)
input_mask = np.array(data[0][3]).astype(np.float32)
labels = np.array(data[0][4]).astype(np.int64)
for batch_id, data in enumerate(
dataset.batch_records_generator(
phase="train",
batch_size=args.batch_size,
shuffle=True,
pad_to_batch_max_seq_len=False)):
batch_size = len(data["input_ids"])
input_ids = np.array(data["input_ids"]).astype(
np.int64).reshape([batch_size, -1, 1])
position_ids = np.array(data["position_ids"]).astype(
np.int64).reshape([batch_size, -1, 1])
segment_ids = np.array(data["segment_ids"]).astype(
np.int64).reshape([batch_size, -1, 1])
input_mask = np.array(data["input_mask"]).astype(
np.float32).reshape([batch_size, -1, 1])
labels = np.array(data["label"]).astype(np.int64).reshape(
[batch_size, 1])
pred = tc(input_ids, position_ids, segment_ids, input_mask)
acc = fluid.layers.accuracy(pred, to_variable(labels))
loss = fluid.layers.cross_entropy(pred, to_variable(labels))
avg_loss = fluid.layers.mean(loss)
avg_loss.backward()
# Update the parameters
adam.minimize(avg_loss)
loss_sum += avg_loss.numpy() * labels.shape[0]
......
......@@ -16,6 +16,7 @@
import argparse
import ast
import paddlehub as hub
# yapf: disable
......@@ -39,35 +40,24 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download the dataset and use accuracy as the metric
# Choose dataset: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC
# metric should be acc, f1 or matthews
dataset = hub.dataset.ChnSentiCorp()
metrics_choices = ["acc"]
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it will do word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
# For ernie_tiny, it uses sub-words to tokenize Chinese sentences
# If not ernie_tiny, sp_model_path and word_dict_path should be set to None
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
dataset = hub.dataset.ChnSentiCorp(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]
# Setup feed list for data feeder
# Must feed all the tensors the module needs
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Select fine-tune strategy, setup config and fine-tune
strategy = hub.AdamWeightDecayStrategy(
warmup_proportion=args.warmup_proportion,
......@@ -85,12 +75,11 @@ if __name__ == '__main__':
# Define a classification fine-tune task by PaddleHub's API
cls_task = hub.TextClassifierTask(
data_reader=reader,
dataset=dataset,
feature=pooled_output,
feed_list=feed_list,
num_classes=dataset.num_labels,
config=config,
metrics_choices=metrics_choices)
metrics_choices=["acc"])
# Fine-tune and evaluate by PaddleHub's API
# will finish training, evaluation, testing, save model automatically
......
......@@ -40,35 +40,24 @@ if __name__ == '__main__':
inputs, outputs, program = module.context(
trainable=True, max_seq_len=args.max_seq_len)
# Download the dataset and use accuracy as the metric
# Choose dataset: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC
# metric should be acc, f1 or matthews
dataset = hub.dataset.ChnSentiCorp()
metrics_choices = ["acc"]
# Use the appropriate tokenizer to preprocess the data set
# For ernie_tiny, it will do word segmentation to get subwords. More details: https://www.jiqizhixin.com/articles/2019-11-06-9
if module.name == "ernie_tiny":
tokenizer = hub.ErnieTinyTokenizer(
vocab_file=module.get_vocab_path(),
spm_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
else:
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
# For ernie_tiny, it uses sub-words to tokenize Chinese sentences
# If not ernie_tiny, sp_model_path and word_dict_path should be set to None
reader = hub.reader.ClassifyReader(
dataset=dataset,
vocab_path=module.get_vocab_path(),
max_seq_len=args.max_seq_len,
sp_model_path=module.get_spm_path(),
word_dict_path=module.get_word_dict_path())
dataset = hub.dataset.ChnSentiCorp(
tokenizer=tokenizer, max_seq_len=args.max_seq_len)
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
token_feature = outputs["sequence_output"]
# Setup feed list for data feeder
# Must feed all the tensors the module needs
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
# Select fine-tune strategy, setup config and fine-tune
strategy = hub.AdamWeightDecayStrategy(
warmup_proportion=args.warmup_proportion,
......@@ -90,13 +79,12 @@ if __name__ == '__main__':
# you must use the outputs["sequence_output"] as the token_feature of TextClassifierTask,
# rather than outputs["pooled_output"], and feature is None
cls_task = hub.TextClassifierTask(
data_reader=reader,
dataset=dataset,
token_feature=token_feature,
feed_list=feed_list,
network=args.network,
num_classes=dataset.num_labels,
config=config,
metrics_choices=metrics_choices)
metrics_choices=["acc"])
# Fine-tune and evaluate by PaddleHub's API
# will finish training, evaluation, testing, save model automatically
......
......@@ -31,6 +31,7 @@ from . import dataset
from . import finetune
from . import reader
from . import network
from . import tokenizer
from .common.dir import USER_HOME
from .common.dir import HUB_HOME
......@@ -70,3 +71,6 @@ from .finetune.strategy import CombinedStrategy
from .autofinetune.evaluator import report_final_result
from .module.nlp_module import NLPPredictionModule, TransformerModule
from .tokenizer.bert_tokenizer import BertTokenizer
from .tokenizer.bert_tokenizer import ErnieTinyTokenizer
......@@ -20,11 +20,16 @@ from __future__ import print_function
import os
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
class BQ(BaseNLPDataset):
def __init__(self):
class BQ(TextClassificationDataset):
"""
The Bank Question (BQ) corpus, a Chinese corpus for sentence semantic equivalence identification (SSEI),
contains 120,000 question pairs drawn from one year of online bank customer service logs.
"""
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "bq")
base_path = self._download_dataset(
dataset_dir,
......@@ -36,18 +41,16 @@ class BQ(BaseNLPDataset):
test_file="test.txt",
label_file=None,
label_list=["0", "1"],
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
if __name__ == "__main__":
ds = BQ()
print("first 10 dev")
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
ds = BQ(tokenizer=BertTokenizer(vocab_file='vocab.txt'), max_seq_len=10)
print("first 10 dev examples")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 train")
for e in ds.get_train_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 test")
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -23,16 +23,16 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
class ChnSentiCorp(BaseNLPDataset):
class ChnSentiCorp(TextClassificationDataset):
"""
ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for
opinion mining)
"""
def __init__(self):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "chnsenticorp")
base_path = self._download_dataset(
dataset_dir,
......@@ -44,7 +44,8 @@ class ChnSentiCorp(BaseNLPDataset):
test_file="test.tsv",
label_file=None,
label_list=["0", "1"],
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -63,6 +64,13 @@ class ChnSentiCorp(BaseNLPDataset):
if __name__ == "__main__":
ds = ChnSentiCorp()
for e in ds.get_train_examples()[:10]:
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = ChnSentiCorp(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev examples")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -20,7 +20,7 @@ import os
from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import MRCDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/cmrc2018.tar.gz"
SPIECE_UNDERLINE = '▁'
......@@ -62,10 +62,14 @@ class CMRC2018Example(object):
return s
class CMRC2018(BaseNLPDataset):
class CMRC2018(MRCDataset):
"""A single set of features of data."""
def __init__(self):
def __init__(self,
tokenizer=None,
max_seq_len=None,
max_query_len=64,
doc_stride=128):
dataset_dir = os.path.join(DATA_HOME, "cmrc2018")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(CMRC2018, self).__init__(
......@@ -75,6 +79,10 @@ class CMRC2018(BaseNLPDataset):
test_file=None,
label_file=None,
label_list=None,
tokenizer=tokenizer,
max_seq_len=max_seq_len,
max_query_len=max_query_len,
doc_stride=doc_stride,
)
def _read_file(self, input_file, phase=False):
......@@ -201,7 +209,9 @@ class CMRC2018(BaseNLPDataset):
if __name__ == "__main__":
print("begin")
ds = CMRC2018()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = CMRC2018(tokenizer=tokenizer, max_seq_len=50)
print("train")
examples = ds.get_train_examples()
for index, e in enumerate(examples):
......
......@@ -121,6 +121,20 @@ class BaseDataset(object):
def get_predict_examples(self):
return self.predict_examples
def get_examples(self, phase):
if phase == "train":
return self.get_train_examples()
elif phase == "dev":
return self.get_dev_examples()
elif phase == "test":
return self.get_test_examples()
elif phase == "val":
return self.get_val_examples()
elif phase == "predict":
return self.get_predict_examples()
else:
raise ValueError("Invalid phase: %s" % phase)
def get_labels(self):
return self.label_list
......
......@@ -20,7 +20,7 @@ import os
from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import MRCDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/drcd.tar.gz"
SPIECE_UNDERLINE = '▁'
......@@ -62,10 +62,16 @@ class DRCDExample(object):
return s
class DRCD(BaseNLPDataset):
class DRCD(MRCDataset):
"""A single set of features of data."""
def __init__(self):
def __init__(
self,
tokenizer=None,
max_seq_len=None,
max_query_len=64,
doc_stride=128,
):
dataset_dir = os.path.join(DATA_HOME, "drcd")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(DRCD, self).__init__(
......@@ -75,6 +81,10 @@ class DRCD(BaseNLPDataset):
test_file="DRCD_test.json",
label_file=None,
label_list=None,
tokenizer=tokenizer,
max_seq_len=max_seq_len,
max_query_len=max_query_len,
doc_stride=doc_stride,
)
def _read_file(self, input_file, phase=None):
......@@ -176,8 +186,8 @@ class DRCD(BaseNLPDataset):
cleaned_answer_text = "".join(
tokenization.whitespace_tokenize(orig_answer_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning((actual_text, " vs ",
cleaned_answer_text, " in ", qa))
logger.warning("Could not find answer: '%s' vs. '%s'" %
(actual_text, cleaned_answer_text))
continue
example = DRCDExample(
qas_id=qas_id,
......@@ -191,7 +201,9 @@ class DRCD(BaseNLPDataset):
if __name__ == "__main__":
ds = DRCD()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = DRCD(tokenizer=tokenizer, max_seq_len=50)
print("train")
examples = ds.get_train_examples()
for index, e in enumerate(examples):
......
......@@ -36,7 +36,7 @@ class GLUE(BaseNLPDataset):
for more information
"""
def __init__(self, sub_dataset='SST-2'):
def __init__(self, sub_dataset='SST-2', tokenizer=None, max_seq_len=None):
# sub_dataset : CoLA, MNLI, MRPC, QNLI, QQP, RTE, SST-2, STS-B
if sub_dataset not in [
'CoLA', 'MNLI', 'MNLI_m', 'MNLI_mm', 'MRPC', 'QNLI', 'QQP',
......@@ -85,7 +85,8 @@ class GLUE(BaseNLPDataset):
predict_file=predict_file,
label_file=None,
label_list=label_list,
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -165,11 +166,13 @@ class GLUE(BaseNLPDataset):
if __name__ == "__main__":
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
for sub_dataset in [
'CoLA', 'MNLI', 'MRPC', 'QNLI', 'QQP', 'RTE', 'SST-2', 'STS-B'
]:
print(sub_dataset)
ds = GLUE(sub_dataset=sub_dataset)
ds = GLUE(sub_dataset=sub_dataset, tokenizer=tokenizer, max_seq_len=10)
for e in ds.get_train_examples()[:2]:
print(e)
print()
......@@ -182,3 +185,6 @@ if __name__ == "__main__":
for e in ds.get_predict_examples()[:2]:
print(e)
print()
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -22,13 +22,13 @@ import os
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/iflytek.tar.gz"
class IFLYTEK(BaseNLPDataset):
def __init__(self):
class IFLYTEK(TextClassificationDataset):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "iflytek")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(IFLYTEK, self).__init__(
......@@ -38,7 +38,8 @@ class IFLYTEK(BaseNLPDataset):
test_file="test.txt",
label_file=None,
label_list=[str(i) for i in range(119)],
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -56,7 +57,9 @@ class IFLYTEK(BaseNLPDataset):
if __name__ == "__main__":
ds = IFLYTEK()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = IFLYTEK(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -67,3 +70,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -23,17 +23,17 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/inews.tar.gz"
class INews(BaseNLPDataset):
class INews(TextClassificationDataset):
"""
INews is a sentiment analysis dataset for Internet News
"""
def __init__(self):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "inews")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(INews, self).__init__(
......@@ -43,7 +43,8 @@ class INews(BaseNLPDataset):
test_file="test.txt",
label_file=None,
label_list=["0", "1", "2"],
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -60,7 +61,10 @@ class INews(BaseNLPDataset):
if __name__ == "__main__":
ds = INews()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = INews(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -71,3 +75,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -23,13 +23,13 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/lcqmc.tar.gz"
class LCQMC(BaseNLPDataset):
def __init__(self):
class LCQMC(TextClassificationDataset):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "lcqmc")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(LCQMC, self).__init__(
......@@ -39,7 +39,8 @@ class LCQMC(BaseNLPDataset):
test_file="test.tsv",
label_file=None,
label_list=["0", "1"],
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -58,7 +59,10 @@ class LCQMC(BaseNLPDataset):
if __name__ == "__main__":
ds = LCQMC()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = LCQMC(tokenizer=tokenizer, max_seq_len=512)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -69,3 +73,7 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -23,12 +23,12 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import SeqLabelingDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/msra_ner.tar.gz"
class MSRA_NER(BaseNLPDataset):
class MSRA_NER(SeqLabelingDataset):
"""
A set of manually annotated Chinese word-segmentation data and
specifications for training and testing a Chinese word-segmentation system
......@@ -36,7 +36,7 @@ class MSRA_NER(BaseNLPDataset):
https://www.microsoft.com/en-us/download/details.aspx?id=52531
"""
def __init__(self):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "msra_ner")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(MSRA_NER, self).__init__(
......@@ -48,7 +48,8 @@ class MSRA_NER(BaseNLPDataset):
label_list=[
"B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"
],
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -67,7 +68,9 @@ class MSRA_NER(BaseNLPDataset):
if __name__ == "__main__":
ds = MSRA_NER()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = MSRA_NER(tokenizer=tokenizer, max_seq_len=30)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -78,3 +81,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -23,19 +23,19 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/nlpcc-dbqa.tar.gz"
class NLPCC_DBQA(BaseNLPDataset):
class NLPCC_DBQA(TextClassificationDataset):
"""
Please refer to
http://tcci.ccf.org.cn/conference/2017/dldoc/taskgline05.pdf
for more information
"""
def __init__(self):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "nlpcc-dbqa")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(NLPCC_DBQA, self).__init__(
......@@ -45,7 +45,8 @@ class NLPCC_DBQA(BaseNLPDataset):
test_file="test.tsv",
label_file=None,
label_list=["0", "1"],
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -64,7 +65,9 @@ class NLPCC_DBQA(BaseNLPDataset):
if __name__ == "__main__":
ds = NLPCC_DBQA()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = NLPCC_DBQA(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -75,3 +78,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -20,7 +20,7 @@ import os
from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import MRCDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz"
......@@ -65,10 +65,17 @@ class SquadExample(object):
return s
class SQUAD(BaseNLPDataset):
class SQUAD(MRCDataset):
"""A single set of features of data."""
def __init__(self, version_2_with_negative=False):
def __init__(
self,
version_2_with_negative=False,
tokenizer=None,
max_seq_len=None,
max_query_len=64,
doc_stride=128,
):
self.version_2_with_negative = version_2_with_negative
if not version_2_with_negative:
train_file = "train-v1.1.json"
......@@ -87,6 +94,10 @@ class SQUAD(BaseNLPDataset):
test_file=None,
label_file=None,
label_list=None,
tokenizer=tokenizer,
max_seq_len=max_seq_len,
max_query_len=max_query_len,
doc_stride=doc_stride,
)
def _read_file(self, input_file, phase=None):
......@@ -177,7 +188,10 @@ class SQUAD(BaseNLPDataset):
if __name__ == "__main__":
ds = SQUAD(version_2_with_negative=True)
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = SQUAD(
version_2_with_negative=True, tokenizer=tokenizer, max_seq_len=512)
print("first 10 dev")
for e in ds.get_dev_examples()[:2]:
print(e)
......
......@@ -22,13 +22,13 @@ import os
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/thucnews.tar.gz"
class THUCNEWS(BaseNLPDataset):
def __init__(self):
class THUCNEWS(TextClassificationDataset):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "thucnews")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
super(THUCNEWS, self).__init__(
......@@ -38,7 +38,8 @@ class THUCNEWS(BaseNLPDataset):
test_file="test.txt",
label_file=None,
label_list=[str(i) for i in range(14)],
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -56,7 +57,9 @@ class THUCNEWS(BaseNLPDataset):
if __name__ == "__main__":
ds = THUCNEWS()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = THUCNEWS(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -67,3 +70,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -20,7 +20,8 @@ from __future__ import print_function
import io
import os
from paddlehub.dataset import InputExample, BaseDataset
from paddlehub.dataset import InputExample
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
from paddlehub.common.dir import DATA_HOME
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/tnews.tar.gz"
......@@ -44,12 +45,12 @@ LABEL_NAME = {
}
class TNews(BaseDataset):
class TNews(TextClassificationDataset):
"""
TNews is the Chinese news classification dataset from the Jinri Toutiao app.
"""
def __init__(self):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "tnews")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
label_list = [
......@@ -63,7 +64,8 @@ class TNews(BaseDataset):
test_file="toutiao_category_test.txt",
label_file=None,
label_list=label_list,
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def get_label_name(self, id):
return LABEL_NAME[id]
......@@ -82,7 +84,9 @@ class TNews(BaseDataset):
if __name__ == "__main__":
ds = TNews()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = TNews(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -93,3 +97,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -22,18 +22,18 @@ import pandas as pd
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import MultiLabelDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/toxic.tar.gz"
class Toxic(BaseNLPDataset):
class Toxic(MultiLabelDataset):
"""
The Kaggle Toxic dataset:
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge
"""
def __init__(self):
def __init__(self, tokenizer=None, max_seq_len=None):
dataset_dir = os.path.join(DATA_HOME, "toxic")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
label_list = [
......@@ -47,7 +47,8 @@ class Toxic(BaseNLPDataset):
test_file="test.csv",
label_file=None,
label_list=label_list,
)
tokenizer=tokenizer,
max_seq_len=max_seq_len)
def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
......@@ -64,7 +65,10 @@ class Toxic(BaseNLPDataset):
if __name__ == "__main__":
ds = Toxic()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = Toxic(tokenizer=tokenizer, max_seq_len=10)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......@@ -75,3 +79,6 @@ if __name__ == "__main__":
for e in ds.get_test_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
print(ds)
print("first 10 dev records")
for e in ds.get_dev_records()[:10]:
print(e)
......@@ -25,19 +25,19 @@ import csv
from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset
from paddlehub.dataset.base_nlp_dataset import TextClassificationDataset
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/XNLI-lan.tar.gz"
class XNLI(BaseNLPDataset):
class XNLI(TextClassificationDataset):
"""
Please refer to
https://arxiv.org/pdf/1809.05053.pdf
for more information
"""
def __init__(self, language='zh'):
def __init__(self, language='zh', tokenizer=None, max_seq_len=None):
if language not in [
"ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw",
"th", "tr", "ur", "vi", "zh"
......@@ -55,6 +55,8 @@ class XNLI(BaseNLPDataset):
test_file="%s_test.tsv" % language,
label_file=None,
label_list=["neutral", "contradiction", "entailment"],
tokenizer=tokenizer,
max_seq_len=max_seq_len,
)
def _read_file(self, input_file, phase=None):
......@@ -74,7 +76,10 @@ class XNLI(BaseNLPDataset):
if __name__ == "__main__":
ds = XNLI()
from paddlehub.tokenizer.bert_tokenizer import BertTokenizer
tokenizer = BertTokenizer(vocab_file='vocab.txt')
ds = XNLI(tokenizer=tokenizer, max_seq_len=20)
print("first 10 dev")
for e in ds.get_dev_examples()[:10]:
print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
......
......@@ -167,7 +167,7 @@ class DefaultStrategy(object):
self.optimizer = fluid.optimizer.Adam(
learning_rate=self.learning_rate, **kwargs)
def execute(self, loss, data_reader, config, dev_count):
def execute(self, loss, max_train_steps):
if self.optimizer is not None:
self.optimizer.minimize(loss)
else:
......@@ -456,26 +456,9 @@ class CombinedStrategy(DefaultStrategy):
"weight_decay"] * scheduled_lr
fluid.layers.assign(output=param, input=updated_param)
def execute(self, loss, data_reader, config, dev_count):
def execute(self, loss, max_train_steps):
# base information
self.main_program = loss.block.program
self.config = config
# self.num_examples = {'train': -1, 'dev': -1, 'test': -1} before data_generator
data_reader.data_generator(
batch_size=config.batch_size, phase='train', shuffle=True)
num_train_examples = data_reader.num_examples['train']
max_train_steps = config.num_epoch * num_train_examples // config.batch_size // dev_count
try:
# nlp_reader
_in_tokens = data_reader.in_tokens
if _in_tokens:
max_train_steps *= data_reader.max_seq_len
except:
# cv_reader without .in_tokens and .max_seq_len
pass
if self.scheduler["discriminative"]["blocks"] > 0 or self.scheduler[
"gradual_unfreeze"]["blocks"] > 0:
......@@ -494,8 +477,7 @@ class CombinedStrategy(DefaultStrategy):
self.regularization_handler(loss, scheduled_lr)
logger.info(self.__str__())
return scheduled_lr, max_train_steps
return scheduled_lr
def exclude_from_weight_decay(self, name):
if name.find("layer_norm") > -1:
......
......@@ -35,6 +35,7 @@ import paddle.fluid as fluid
from visualdl import LogWriter
import paddlehub as hub
from paddlehub.reader.nlp_reader import BaseNLPReader
from paddlehub.common.paddle_helper import dtype_map, clone_program
from paddlehub.common.utils import mkdir
from paddlehub.common.dir import tmp_dir
......@@ -84,7 +85,7 @@ class RunEnv(object):
self.start_program = None
self.main_program_compiled = None
self.py_reader = None
self.reader = None
self.generator = None
self.loss = None
self.labels = None
self.metrics = None
......@@ -260,8 +261,8 @@ class BaseTask(object):
BaseTask is the base class of all tasks. It builds the complete running environment.
Args:
feed_list (list): the inputs name
data_reader (object): data reader for the task
feed_list (list): the input variable names. Deprecated in paddlehub v1.8.
data_reader (object): data reader for the task. Deprecated in paddlehub v1.8.
main_program (object): the customized main_program, default None
startup_program (object): the customized startup_program, default None
config (object): the config for the task, default None
......@@ -269,16 +270,13 @@ class BaseTask(object):
"""
def __init__(self,
feed_list,
data_reader,
dataset=None,
feed_list=None,
data_reader=None,
main_program=None,
startup_program=None,
config=None,
metrics_choices="default"):
# base item
self._base_data_reader = data_reader
self._base_feed_list = feed_list
# metrics item
self.best_score = -999
if metrics_choices == "default":
......@@ -293,7 +291,6 @@ class BaseTask(object):
if main_program is None:
self._base_main_program = clone_program(
fluid.default_main_program(), for_test=False)
else:
self._base_main_program = clone_program(
main_program, for_test=False)
......@@ -344,6 +341,23 @@ class BaseTask(object):
# set default phase
self.enter_phase("train")
self.dataset = dataset
if dataset:
self._label_list = dataset.get_labels()
# Compatibility code for the usage deprecated in paddlehub v1.8.
self._base_data_reader = data_reader
self._base_feed_list = feed_list
if isinstance(data_reader, BaseNLPReader):
self._compatible_mode = True
logger.warning(
"PaddleHub v1.8 has deprecated the reader and feed_list parameters in the nlp Task. We provided an easier usage, "
"in which you can use your tokenizer to preprocess dataset and run task in a clear flow. "
"New demo see https://github.com/PaddlePaddle/PaddleHub/blob/release/v1.8/demo/text_classification/text_cls.py"
)
else:
self._compatible_mode = False
@contextlib.contextmanager
def phase_guard(self, phase):
self.enter_phase(phase)
......@@ -420,9 +434,29 @@ class BaseTask(object):
with fluid.program_guard(self.env.main_program,
self._base_startup_program):
with fluid.unique_name.guard(self.env.UNG):
self.scheduled_lr, self.max_train_steps = self.config.strategy.execute(
self.loss, self._base_data_reader, self.config,
self.device_count)
if self._compatible_mode:
# This branch is compatibility code for the usage deprecated in paddlehub v1.8.
self._base_data_reader.data_generator(
batch_size=self.config.batch_size,
phase='train',
shuffle=True)
num_train_examples = self._base_data_reader.num_examples[
'train']
try:
# nlp_reader
_in_tokens = self._base_data_reader.in_tokens
if _in_tokens:
num_train_examples *= self._base_data_reader.max_seq_len
except:
# cv_reader without .in_tokens and .max_seq_len
pass
else:
num_train_examples = len(
self.dataset.get_train_records())
self.max_train_steps = self.config.num_epoch * num_train_examples // self.config.batch_size // self.device_count
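# Illustrative arithmetic (hypothetical numbers): with num_epoch=3, 25000 train
# records, batch_size=32 and device_count=1, max_train_steps = 3 * 25000 // 32 // 1 = 2343.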
self.scheduled_lr = self.config.strategy.execute(
self.loss, self.max_train_steps)
if self.is_train_phase:
loss_name = self.env.loss.name
......@@ -529,17 +563,40 @@ class BaseTask(object):
return self.main_program
@property
def reader(self):
if self.is_predict_phase:
data = self._predict_data
def generator(self):
if self._compatible_mode:
if self.is_predict_phase:
data = self._predict_data
else:
data = None
self.env.generator = self._base_data_reader.data_generator(
batch_size=self.config.batch_size,
phase=self.phase,
data=data,
return_list=not self.config.use_pyreader)
else:
data = None
self.env.reader = self._base_data_reader.data_generator(
batch_size=self.config.batch_size,
phase=self.phase,
data=data,
return_list=not self.config.use_pyreader)
return self.env.reader
def data_generator(records):
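# Each record is expected to be a dict keyed by feed names (as produced by a
# dataset built with a tokenizer); values are yielded in self.feed_list order.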
def wrapper():
for record in records:
values = []
for feed_name in self.feed_list:
values.append(record[feed_name])
yield values
return wrapper
if self.is_predict_phase:
records = self._predict_data
else:
if self.is_train_phase:
shuffle = True
else:
shuffle = False
records = self.dataset.get_records(
phase=self.phase, shuffle=shuffle)
self.env.generator = data_generator(records)
return self.env.generator
@property
def loss(self):
......@@ -580,13 +637,30 @@ class BaseTask(object):
@property
def feed_list(self):
feed_list = [varname for varname in self._base_feed_list]
if self.is_train_phase or self.is_test_phase:
feed_list += [label.name for label in self.labels]
if self._compatible_mode:
feed_list = [varname for varname in self._base_feed_list]
if self.is_train_phase or self.is_test_phase:
feed_list += [label.name for label in self.labels]
else:
if not self.env.is_inititalized:
self._build_env()
if self._predict_data:
feed_list = list(self._predict_data[0].keys())
else:
feed_list = self.dataset.get_feed_list(self.phase)
feed_list = [
feed_name for feed_name in feed_list
if feed_name in self.main_program.global_block().vars
]
return feed_list
@property
def feed_var_list(self):
if not self.env.is_inititalized:
self._build_env()
vars = self.main_program.global_block().vars
return [vars[varname] for varname in self.feed_list]
......@@ -890,13 +964,20 @@ class BaseTask(object):
self.env.current_epoch += 1
# Final evaluation
if self._base_data_reader.get_dev_examples() != []:
if self._compatible_mode:
dev_examples = self._base_data_reader.get_dev_examples()
test_examples = self._base_data_reader.get_test_examples()
else:
dev_examples = self.dataset.get_dev_examples()
test_examples = self.dataset.get_test_examples()
if dev_examples != []:
# Warning: DO NOT use self.eval(phase="dev", load_best_model=True) during training.
# It will prevent the trainer from resuming training from the checkpoint after eval.
# More importantly, the model should evaluate its current performance during training.
self.eval(phase="dev")
if self._base_data_reader.get_test_examples() != []:
if test_examples != []:
self.eval(phase="test", load_best_model=True)
# Save checkpoint after finetune
self.save_checkpoint()
......@@ -957,17 +1038,41 @@ class BaseTask(object):
global_run_states = []
period_run_states = []
for run_step, batch in enumerate(self.reader(), start=1):
feed_var_shape = []
feed_var_type = []
for var in self.feed_var_list:
feed_var_shape.append(var.shape)
feed_var_type.append(dtype_map[var.dtype])
if self._compatible_mode:
data_reader = self.generator
else:
data_reader = paddle.batch(
self.generator, batch_size=self.config.batch_size)
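# paddle.batch groups the single-sample records yielded by self.generator
# into lists of batch_size samples.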
for batch in data_reader():
if self._compatible_mode and not self.config.use_pyreader:
# if pyreader is not used, the nlp_reader returns [batch]
batch = batch[0]
step_run_state = RunState(len(self.fetch_list))
step_run_state.run_step = 1
num_batch_examples = len(batch)
if not self.config.use_pyreader:
# if use pyreader, the nlp_reader return [batch]
batch = batch[0]
batch = [fluid.core.PaddleTensor(data) for data in batch]
fetch_result = self._predictor.run(batch)
# Process the data into the shape and type expected by the model
processed_batch = [[] for i in range(len(self.feed_list))]
if self._compatible_mode:
processed_batch = batch
else:
for sample in batch:
for i, data in enumerate(sample):
processed_batch[i].append(data)
tensor_batch = [[] for i in range(len(self.feed_list))]
for i in range(len(processed_batch)):
processed_batch[i] = np.array(processed_batch[i]).reshape(
feed_var_shape[i]).astype(feed_var_type[i])
tensor_batch[i] = fluid.core.PaddleTensor(processed_batch[i])
fetch_result = self._predictor.run(tensor_batch)
for index, result in enumerate(fetch_result):
step_run_state.run_results[index] = result.as_ndarray()
step_run_state.run_examples += num_batch_examples
......@@ -978,18 +1083,23 @@ class BaseTask(object):
global_run_states += period_run_states
return global_run_states
def predict(self,
data,
load_best_model=True,
return_result=False,
accelerate_mode=True):
def predict(
self,
data=None,
label_list=None,
load_best_model=True,
return_result=False,
accelerate_mode=True,
):
"""
make prediction for the input data.
Args:
data (list): the data will be predicted.
data (list): the data to be predicted. Each element should be a record when the task is initialized without the data_reader parameter,
or a plain-text string when the task is initialized with the data_reader parameter (deprecated in paddlehub v1.8).
label_list (list): the label list, used to postprocess the output.
load_best_model (bool): load the best model or not
return_result (bool): return a readable result or just the raw run result
return_result (bool): return a readable result or just the raw run result. Always treated as True when the task is not initialized with the data_reader parameter.
accelerate_mode (bool): use high-performance predictor or not
Returns:
......@@ -1005,6 +1115,7 @@ class BaseTask(object):
with self.phase_guard(phase="predict"):
self._predict_data = data
self._label_list = label_list
self._predict_start_event()
if load_best_model:
......@@ -1020,7 +1131,7 @@ class BaseTask(object):
self._predict_end_event(run_states)
self._predict_data = None
if return_result:
if return_result or not self._compatible_mode:
return self._postprocessing(run_states)
return run_states
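A minimal sketch of the new prediction flow (illustrative only; `task` and `dataset` are hypothetical objects following the APIs shown above):
# Records are dicts keyed by feed names, e.g. produced by a dataset built with a tokenizer.
records = dataset.get_dev_records()[:8]        # reuse a few dev records purely for illustration
results = task.predict(
    data=records,                              # list of record dicts
    label_list=dataset.get_labels(),           # lets _postprocessing map ids back to labels
    load_best_model=True)
print(results)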
......@@ -1057,20 +1168,34 @@ class BaseTask(object):
capacity=64,
use_double_buffer=True,
iterable=True)
data_reader = data_loader.set_batch_generator(
self.reader, places=self.places)
if self._compatible_mode:
data_reader = data_loader.set_batch_generator(
self.generator, places=self.places)
else:
data_reader = data_loader.set_sample_generator(
self.generator,
places=self.places,
batch_size=self.config.batch_size,
drop_last=True)
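# set_sample_generator batches the single-record samples itself, whereas the
# deprecated reader above already yields whole batches via set_batch_generator.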
else:
data_feeder = fluid.DataFeeder(
feed_list=self.feed_list, place=self.place)
data_reader = data_feeder.decorate_reader(
self.reader,
multi_devices=self.config.use_data_parallel,
drop_last=True)
if self._compatible_mode:
data_reader = data_feeder.decorate_reader(
self.generator,
multi_devices=self.config.use_data_parallel,
drop_last=True)
else:
data_reader = data_feeder.decorate_reader(
paddle.batch(
self.generator, batch_size=self.config.batch_size),
multi_devices=self.config.use_data_parallel,
drop_last=True)
global_run_states = []
period_run_states = []
for run_step, batch in enumerate(data_reader(), start=1):
for batch in data_reader():
step_run_state = RunState(len(self.fetch_list))
step_run_state.run_step = 1
num_batch_examples = len(batch)
......@@ -1107,6 +1232,5 @@ class BaseTask(object):
return global_run_states
def __repr__(self):
return "Task: %s with metrics_choices: %s, reader: %s, %s" % (
self.__class__.__name__, self.metrics_choices,
self._base_data_reader.__class__.__name__, self.config)
return "Task: %s with metrics_choices: %s, %s" % (
self.__class__.__name__, self.metrics_choices, self.config)
......@@ -19,13 +19,12 @@ from __future__ import print_function
from collections import OrderedDict
import numpy as np
import paddle
import paddle.fluid as fluid
import time
from paddlehub.common.logger import logger
from paddlehub.finetune.evaluate import calculate_f1_np, matthews_corrcoef
from paddlehub.reader.nlp_reader import ClassifyReader
from paddlehub.reader.nlp_reader import ClassifyReader, LACClassifyReader
import paddlehub.network as net
from .base_task import BaseTask
......@@ -35,8 +34,9 @@ class ClassifierTask(BaseTask):
def __init__(self,
feature,
num_classes,
feed_list,
data_reader,
dataset=None,
feed_list=None,
data_reader=None,
startup_program=None,
config=None,
hidden_units=None,
......@@ -46,6 +46,7 @@ class ClassifierTask(BaseTask):
main_program = feature.block.program
super(ClassifierTask, self).__init__(
dataset=dataset,
data_reader=data_reader,
main_program=main_program,
feed_list=feed_list,
......@@ -109,7 +110,7 @@ class ClassifierTask(BaseTask):
run_examples += run_state.run_examples
run_step += run_state.run_step
loss_sum += np.mean(
run_state.run_results[-2]) * run_state.run_examples
run_state.run_results[-1]) * run_state.run_examples
acc_sum += np.mean(
run_state.run_results[2]) * run_state.run_examples
np_labels = run_state.run_results[0]
......@@ -140,20 +141,28 @@ class ClassifierTask(BaseTask):
return scores, avg_loss, run_speed
def _postprocessing(self, run_states):
try:
id2label = {
val: key
for key, val in self._base_data_reader.label_map.items()
}
except:
raise Exception(
"ImageClassificationDataset does not support postprocessing, please use BaseCVDataset instead"
)
if self._compatible_mode:
try:
label_list = list(self._base_data_reader.label_map.keys())
except:
raise Exception(
"ImageClassificationDataset does not support postprocessing, please use BaseCVDataset instead"
)
else:
if self._label_list:
label_list = self._label_list
else:
logger.warning(
"Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
)
return run_states
results = []
for batch_state in run_states:
batch_result = batch_state.run_results
batch_infer = np.argmax(batch_result[0], axis=1)
results += [id2label[sample_infer] for sample_infer in batch_infer]
results += [
label_list[sample_infer] for sample_infer in batch_infer
]
return results
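# Illustrative only: with label_list = ["neg", "pos"] and batch outputs
# [[0.9, 0.1], [0.2, 0.8]], np.argmax(..., axis=1) yields [0, 1], so the
# postprocessed results are ["neg", "pos"].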
......@@ -166,22 +175,24 @@ class TextClassifierTask(ClassifierTask):
It uses a fully-connected layer with a softmax activation function to classify texts.
"""
def __init__(self,
num_classes,
feed_list,
data_reader,
feature=None,
token_feature=None,
network=None,
startup_program=None,
config=None,
hidden_units=None,
metrics_choices="default"):
def __init__(
self,
num_classes,
dataset=None,
feed_list=None, # Deprecated
data_reader=None, # Deprecated
feature=None,
token_feature=None,
network=None,
startup_program=None,
config=None,
hidden_units=None,
metrics_choices="default"):
"""
Args:
num_classes: total labels of the text classification task.
feed_list(list): the variable name that will be feeded to the main program
data_reader(object): data reader for the task. It must be one of ClassifyReader and LACClassifyReader.
feed_list(list): the variable names that will be fed to the main program. Deprecated in paddlehub v1.8.
data_reader(object): data reader for the task. It must be one of ClassifyReader and LACClassifyReader. Deprecated in paddlehub v1.8.
feature(Variable): the `feature` will be used to classify texts. It must be the sentence-level feature, shape as [-1, emb_size]. `token_feature` and `feature` cannot be set at the same time; exactly one of them must be set. Default None.
token_feature(Variable): the token-level feature that will be used to connect the pre-defined network, shape as [-1, seq_len, emb_size]. Default None.
network(str): the pre-defined network. Choices: 'bilstm', 'bow', 'cnn', 'dpcnn', 'gru' and 'lstm'. Default None. If network is set, then `token_feature` must be set and `feature` must be None.
......@@ -193,12 +204,12 @@ class TextClassifierTask(ClassifierTask):
"""
if (not feature) and (not token_feature):
logger.error(
'Both token_feature and feature are None, one of them must be setted.'
'Both token_feature and feature are None, one of them must be set.'
)
exit(1)
elif feature and token_feature:
logger.error(
'Both token_feature and feature are setted. One should be setted, the other should be None.'
'Both token_feature and feature are set. One should be set, the other should be None.'
)
exit(1)
......@@ -226,6 +237,7 @@ class TextClassifierTask(ClassifierTask):
metrics_choices = ["acc"]
super(TextClassifierTask, self).__init__(
dataset=dataset,
data_reader=data_reader,
feature=feature if feature else token_feature,
num_classes=num_classes,
......@@ -236,16 +248,14 @@ class TextClassifierTask(ClassifierTask):
metrics_choices=metrics_choices)
def _build_net(self):
if isinstance(self._base_data_reader, ClassifyReader):
# ClassifyReader will return the seqence length of an input text
if not isinstance(self._base_data_reader, LACClassifyReader):
# LACClassifyReader won't return the sequence length, while a Dataset with a tokenizer and ClassifyReader will.
self.seq_len = fluid.layers.data(
name="seq_len", shape=[1], dtype='int64', lod_level=0)
self.seq_len_used = fluid.layers.squeeze(self.seq_len, axes=[1])
# unpad the token_feature
unpad_feature = fluid.layers.sequence_unpad(
self.feature, length=self.seq_len_used)
if self.network:
# add pre-defined net
net_func = getattr(net.classification, self.network)
......@@ -254,9 +264,14 @@ class TextClassifierTask(ClassifierTask):
cls_feats = net_func(
self.feature, emb_dim=self.feature.shape[-1])
else:
cls_feats = net_func(unpad_feature)
logger.info(
"%s has been added in the TextClassifierTask!" % self.network)
if self._compatible_mode and isinstance(self._base_data_reader,
LACClassifyReader):
cls_feats = net_func(self.feature)
else:
cls_feats = net_func(unpad_feature)
if self.is_train_phase:
logger.info("%s has been added in the TextClassifierTask!" %
self.network)
else:
# not use pre-defined net but to use fc net
cls_feats = fluid.layers.dropout(
......@@ -286,12 +301,15 @@ class TextClassifierTask(ClassifierTask):
@property
def feed_list(self):
feed_list = [varname for varname in self._base_feed_list]
if isinstance(self._base_data_reader, ClassifyReader):
# ClassifyReader will return the seqence length of an input text
feed_list += [self.seq_len.name]
if self.is_train_phase or self.is_test_phase:
feed_list += [self.labels[0].name]
if self._compatible_mode:
feed_list = [varname for varname in self._base_feed_list]
if isinstance(self._base_data_reader, ClassifyReader):
# ClassifyReader will return the sequence length of an input text
feed_list += [self.seq_len.name]
if self.is_train_phase or self.is_test_phase:
feed_list += [self.labels[0].name]
else:
feed_list = super(TextClassifierTask, self).feed_list
return feed_list
@property
......@@ -303,11 +321,10 @@ class TextClassifierTask(ClassifierTask):
]
else:
# predict phase
fetch_list = [self.outputs[0].name]
if isinstance(self._base_data_reader, ClassifyReader):
# to avoid save_inference_model to prune seq_len variable
fetch_list += [self.seq_len.name]
if isinstance(self._base_data_reader, LACClassifyReader):
fetch_list = [self.outputs[0].name]
else:
fetch_list = [self.outputs[0].name, self.seq_len.name]
return fetch_list
......@@ -316,8 +333,9 @@ class MultiLabelClassifierTask(ClassifierTask):
def __init__(self,
feature,
num_classes,
feed_list,
data_reader,
dataset=None,
feed_list=None,
data_reader=None,
startup_program=None,
config=None,
hidden_units=None,
......@@ -325,8 +343,8 @@ class MultiLabelClassifierTask(ClassifierTask):
if metrics_choices == "default":
metrics_choices = ["auc"]
main_program = feature.block.program
super(MultiLabelClassifierTask, self).__init__(
dataset=dataset,
data_reader=data_reader,
feature=feature,
num_classes=num_classes,
......@@ -335,7 +353,10 @@ class MultiLabelClassifierTask(ClassifierTask):
config=config,
hidden_units=hidden_units,
metrics_choices=metrics_choices)
self.class_name = list(data_reader.label_map.keys())
if self._compatible_mode:
self.class_name = list(data_reader.label_map.keys())
else:
self.class_name = self._label_list
def _build_net(self):
cls_feats = fluid.layers.dropout(
......@@ -428,13 +449,22 @@ class MultiLabelClassifierTask(ClassifierTask):
def _postprocessing(self, run_states):
results = []
label_list = list(self._base_data_reader.label_map.keys())
if self._compatible_mode:
label_list = list(self._base_data_reader.label_map.keys())
else:
if self._label_list:
label_list = self._label_list
else:
logger.warning(
"Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
)
return run_states
for batch_state in run_states:
batch_result = batch_state.run_results
for sample_id in range(len(batch_result[0])):
sample_result = []
for category_id in range(
self._base_data_reader.dataset.num_labels):
for category_id in range(len(label_list)):
sample_category_prob = batch_result[category_id][sample_id]
sample_category_value = np.argmax(sample_category_prob)
sample_result.append(
......
......@@ -29,8 +29,9 @@ from .base_task import BaseTask
class RegressionTask(BaseTask):
def __init__(self,
feature,
feed_list,
data_reader,
dataset=None,
feed_list=None,
data_reader=None,
startup_program=None,
config=None,
hidden_units=None,
......@@ -40,6 +41,7 @@ class RegressionTask(BaseTask):
main_program = feature.block.program
super(RegressionTask, self).__init__(
dataset=dataset,
data_reader=data_reader,
main_program=main_program,
feed_list=feed_list,
......
......@@ -21,10 +21,9 @@ import time
from collections import OrderedDict
import numpy as np
import paddle
import paddle.fluid as fluid
from paddlehub.finetune.evaluate import chunk_eval, calculate_f1
from paddlehub.common.utils import version_compare
from paddlehub.common.logger import logger
from .base_task import BaseTask
......@@ -33,8 +32,9 @@ class SequenceLabelTask(BaseTask):
feature,
max_seq_len,
num_classes,
feed_list,
data_reader,
dataset=None,
feed_list=None,
data_reader=None,
startup_program=None,
config=None,
metrics_choices="default",
......@@ -46,6 +46,7 @@ class SequenceLabelTask(BaseTask):
main_program = feature.block.program
super(SequenceLabelTask, self).__init__(
dataset=dataset,
data_reader=data_reader,
main_program=main_program,
feed_list=feed_list,
......@@ -199,11 +200,14 @@ class SequenceLabelTask(BaseTask):
@property
def feed_list(self):
feed_list = [varname for varname in self._base_feed_list]
if self.is_train_phase or self.is_test_phase:
feed_list += [self.labels[0].name, self.seq_len.name]
if self._compatible_mode:
feed_list = [varname for varname in self._base_feed_list]
if self.is_train_phase or self.is_test_phase:
feed_list += [self.labels[0].name, self.seq_len.name]
else:
feed_list += [self.seq_len.name]
else:
feed_list += [self.seq_len.name]
feed_list = super(SequenceLabelTask, self).feed_list
return feed_list
@property
......@@ -215,10 +219,22 @@ class SequenceLabelTask(BaseTask):
return [output.name for output in self.outputs]
def _postprocessing(self, run_states):
id2label = {
val: key
for key, val in self._base_data_reader.label_map.items()
}
if self._compatible_mode:
id2label = {
val: key
for key, val in self._base_data_reader.label_map.items()
}
else:
if self._label_list:
id2label = {}
for index, label in enumerate(self._label_list):
id2label[index] = label
else:
logger.warning(
"Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
)
return run_states
results = []
for batch_states in run_states:
batch_results = batch_states.run_results
......
......@@ -688,11 +688,13 @@ class Features(object):
s = ""
s += "unique_id: %s " % self.unique_id
s += "example_index: %s " % self.example_index
s += "doc_span_index: %s" % self.doc_span_index
s += "tokens: %s" % self.tokens
s += "token_to_orig_map %s" % self.token_to_orig_map
s += "token_is_max_context %s" % self.token_is_max_context
s += "start_position: %s " % self.start_position
s += "end_position: %s " % self.end_position
s += "is_impossible: %s " % self.is_impossible
# s += "tokens: %s" % self.tokens
# s += "token_to_orig_map %s" % self.token_to_orig_map
return s
......
......@@ -140,29 +140,6 @@ class FullTokenizer(object):
return convert_by_vocab(self.inv_vocab, ids)
class CharTokenizer(object):
"""Runs end-to-end tokenziation."""
def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
def tokenize(self, text):
split_tokens = []
for token in text.lower().split(" "):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)
return split_tokens
def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)
def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
class WSSPTokenizer(object):
def __init__(self, vocab_file, sp_model_dir, word_dict, ws=True,
lower=True):
......
from .bert_tokenizer import BertTokenizer
from .bert_tokenizer import ErnieTinyTokenizer
This diff is collapsed.
from collections import OrderedDict
import unicodedata
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = {}
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n").split("\t")[0]
vocab[token] = index
return vocab
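# Illustrative vocab.txt contents (hypothetical): lines may be a bare token or
# "token<TAB>extra"; only the first tab-separated field is used as the key, and
# line order determines the id, e.g. [PAD] -> 0, [UNK] -> 1, the -> 2.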
def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens
def is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (
cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
def is_chinese_char(char):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
cp = ord(char)
if ((cp >= 0x4E00 and cp <= 0x9FFF) or (cp >= 0x3400 and cp <= 0x4DBF) #
or (cp >= 0x20000 and cp <= 0x2A6DF) #
or (cp >= 0x2A700 and cp <= 0x2B73F) #
or (cp >= 0x2B740 and cp <= 0x2B81F) #
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
): #
return True
return False
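A quick sanity check of the helpers above (expected outputs shown as comments, assuming this file is imported as a module):
print(whitespace_tokenize("  Hello,  world  "))    # ['Hello,', 'world']
print(is_whitespace(" "), is_control("\x00"))      # True True
print(is_punctuation(","), is_chinese_char("中"))  # True True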
......@@ -8,6 +8,7 @@ visualdl >= 2.0.0b
cma >= 2.7.0
sentencepiece
colorlog
tqdm
# pandas no longer support python2 in version 0.25 and above
pandas ; python_version >= "3"
......