未验证 提交 2ef7c1e9 编写于 作者: W wuzewu 提交者: GitHub

add high performance, dataloader and annotation (#406)

* use dataloader
上级 2865db04
...@@ -16,4 +16,4 @@ python -u reading_comprehension.py \ ...@@ -16,4 +16,4 @@ python -u reading_comprehension.py \
--warmup_proportion=0.1 \ --warmup_proportion=0.1 \
--num_epoch=2 \ --num_epoch=2 \
--max_seq_len=512 \ --max_seq_len=512 \
--use_data_parallel=True --use_data_parallel=False
...@@ -46,6 +46,7 @@ from .module.manager import default_module_manager ...@@ -46,6 +46,7 @@ from .module.manager import default_module_manager
from .io.type import DataType from .io.type import DataType
from .finetune.task import BaseTask
from .finetune.task import ClassifierTask from .finetune.task import ClassifierTask
from .finetune.task import TextClassifierTask from .finetune.task import TextClassifierTask
from .finetune.task import ImageClassifierTask from .finetune.task import ImageClassifierTask
......
...@@ -409,7 +409,8 @@ class ReadingComprehensionTask(BaseTask): ...@@ -409,7 +409,8 @@ class ReadingComprehensionTask(BaseTask):
def _build_net(self): def _build_net(self):
self.unique_ids = fluid.layers.data( self.unique_ids = fluid.layers.data(
name="unique_ids", shape=[-1, 1], lod_level=0, dtype="int64") name="unique_ids", shape=[-1, 1], lod_level=0, dtype="int64")
# to avoid memory optimization
_ = fluid.layers.assign(self.unique_ids)
logits = fluid.layers.fc( logits = fluid.layers.fc(
input=self.feature, input=self.feature,
size=2, size=2,
......
...@@ -64,17 +64,17 @@ class SequenceLabelTask(BaseTask): ...@@ -64,17 +64,17 @@ class SequenceLabelTask(BaseTask):
return True return True
def _build_net(self): def _build_net(self):
self.seq_len = fluid.layers.data(
name="seq_len", shape=[1], dtype='int64', lod_level=0)
if version_compare(paddle.__version__, "1.6"): if version_compare(paddle.__version__, "1.6"):
self.seq_len = fluid.layers.data( self.seq_len_used = fluid.layers.squeeze(self.seq_len, axes=[1])
name="seq_len", shape=[-1], dtype='int64')
else: else:
self.seq_len = fluid.layers.data( self.seq_len_used = self.seq_len
name="seq_len", shape=[1], dtype='int64')
seq_len = fluid.layers.assign(self.seq_len)
if self.add_crf: if self.add_crf:
unpad_feature = fluid.layers.sequence_unpad( unpad_feature = fluid.layers.sequence_unpad(
self.feature, length=self.seq_len) self.feature, length=self.seq_len_used)
self.emission = fluid.layers.fc( self.emission = fluid.layers.fc(
size=self.num_classes, size=self.num_classes,
input=unpad_feature, input=unpad_feature,
...@@ -103,7 +103,6 @@ class SequenceLabelTask(BaseTask): ...@@ -103,7 +103,6 @@ class SequenceLabelTask(BaseTask):
self.ret_infers = fluid.layers.reshape( self.ret_infers = fluid.layers.reshape(
x=fluid.layers.argmax(self.logits, axis=2), shape=[-1, 1]) x=fluid.layers.argmax(self.logits, axis=2), shape=[-1, 1])
ret_infers = fluid.layers.assign(self.ret_infers)
logits = self.logits logits = self.logits
logits = fluid.layers.flatten(logits, axis=2) logits = fluid.layers.flatten(logits, axis=2)
...@@ -118,7 +117,8 @@ class SequenceLabelTask(BaseTask): ...@@ -118,7 +117,8 @@ class SequenceLabelTask(BaseTask):
def _add_loss(self): def _add_loss(self):
if self.add_crf: if self.add_crf:
labels = fluid.layers.sequence_unpad(self.labels[0], self.seq_len) labels = fluid.layers.sequence_unpad(self.labels[0],
self.seq_len_used)
crf_cost = fluid.layers.linear_chain_crf( crf_cost = fluid.layers.linear_chain_crf(
input=self.emission, input=self.emission,
label=labels, label=labels,
...@@ -133,7 +133,8 @@ class SequenceLabelTask(BaseTask): ...@@ -133,7 +133,8 @@ class SequenceLabelTask(BaseTask):
def _add_metrics(self): def _add_metrics(self):
if self.add_crf: if self.add_crf:
labels = fluid.layers.sequence_unpad(self.labels[0], self.seq_len) labels = fluid.layers.sequence_unpad(self.labels[0],
self.seq_len_used)
(precision, recall, f1_score, num_infer_chunks, num_label_chunks, (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
num_correct_chunks) = fluid.layers.chunk_eval( num_correct_chunks) = fluid.layers.chunk_eval(
input=self.outputs[0], input=self.outputs[0],
...@@ -146,7 +147,7 @@ class SequenceLabelTask(BaseTask): ...@@ -146,7 +147,7 @@ class SequenceLabelTask(BaseTask):
else: else:
self.ret_labels = fluid.layers.reshape( self.ret_labels = fluid.layers.reshape(
x=self.labels[0], shape=[-1, 1]) x=self.labels[0], shape=[-1, 1])
return [self.ret_labels, self.ret_infers, self.seq_len] return [self.ret_labels, self.ret_infers, self.seq_len_used]
def _calculate_metrics(self, run_states): def _calculate_metrics(self, run_states):
total_infer = total_label = total_correct = loss_sum = 0 total_infer = total_label = total_correct = loss_sum = 0
...@@ -214,7 +215,7 @@ class SequenceLabelTask(BaseTask): ...@@ -214,7 +215,7 @@ class SequenceLabelTask(BaseTask):
if self.is_train_phase or self.is_test_phase: if self.is_train_phase or self.is_test_phase:
return [metric.name for metric in self.metrics] + [self.loss.name] return [metric.name for metric in self.metrics] + [self.loss.name]
elif self.is_predict_phase: elif self.is_predict_phase:
return [self.ret_infers.name] + [self.seq_len.name] return [self.ret_infers.name] + [self.seq_len_used.name]
return [output.name for output in self.outputs] return [output.name for output in self.outputs]
def _postprocessing(self, run_states): def _postprocessing(self, run_states):
......
#coding:utf-8 # coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License" # Licensed under the Apache License, Version 2.0 (the "License"
...@@ -77,7 +77,8 @@ class ImageClassificationReader(BaseReader): ...@@ -77,7 +77,8 @@ class ImageClassificationReader(BaseReader):
batch_size=1, batch_size=1,
phase="train", phase="train",
shuffle=False, shuffle=False,
data=None): data=None,
return_list=True):
if phase != 'predict' and not self.dataset: if phase != 'predict' and not self.dataset:
raise ValueError("The dataset is none and it's not allowed!") raise ValueError("The dataset is none and it's not allowed!")
if phase == "train": if phase == "train":
...@@ -135,14 +136,48 @@ class ImageClassificationReader(BaseReader): ...@@ -135,14 +136,48 @@ class ImageClassificationReader(BaseReader):
def _data_reader(): def _data_reader():
if shuffle: if shuffle:
np.random.shuffle(data) np.random.shuffle(data)
images = []
labels = []
if phase == "predict": if phase == "predict":
for image_path in data: for image_path in data:
image = preprocess(image_path) image = preprocess(image_path)
yield (image, ) images.append(image.astype('float32'))
if len(images) == batch_size:
# predictor must receive numpy array not list
images = np.array([images]).astype('float32')
if return_list:
# for DataFeeder
yield [images]
else:
# for DataLoader
yield images
images = []
if images:
images = np.array([images]).astype('float32')
if return_list:
yield [images]
else:
yield images
images = []
else: else:
for image_path, label in data: for image_path, label in data:
image = preprocess(image_path) image = preprocess(image_path)
yield (image, label) images.append(image.astype('float32'))
labels.append([int(label)])
return paddle.batch(_data_reader, batch_size=batch_size)
if len(images) == batch_size:
if return_list:
yield [[images, labels]]
else:
yield [images, labels]
images = []
labels = []
if images:
if return_list:
yield [[images, labels]]
else:
yield [images, labels]
images = []
labels = []
return _data_reader
...@@ -22,7 +22,7 @@ import numpy as np ...@@ -22,7 +22,7 @@ import numpy as np
import six import six
from collections import namedtuple from collections import namedtuple
import paddle import paddle.fluid as fluid
from paddlehub.reader import tokenization from paddlehub.reader import tokenization
from paddlehub.common.logger import logger from paddlehub.common.logger import logger
...@@ -203,7 +203,8 @@ class BaseNLPReader(BaseReader): ...@@ -203,7 +203,8 @@ class BaseNLPReader(BaseReader):
batch_size=1, batch_size=1,
phase='train', phase='train',
shuffle=True, shuffle=True,
data=None): data=None,
return_list=True):
if phase != 'predict' and not self.dataset: if phase != 'predict' and not self.dataset:
raise ValueError("The dataset is None ! It isn't allowed.") raise ValueError("The dataset is None ! It isn't allowed.")
if phase == 'train': if phase == 'train':
...@@ -255,7 +256,12 @@ class BaseNLPReader(BaseReader): ...@@ -255,7 +256,12 @@ class BaseNLPReader(BaseReader):
for batch_data in self._prepare_batch_data( for batch_data in self._prepare_batch_data(
examples, batch_size, phase=phase): examples, batch_size, phase=phase):
yield [batch_data] if return_list:
# for DataFeeder
yield [batch_data]
else:
# for DataLoader
yield batch_data
return wrapper return wrapper
...@@ -666,7 +672,8 @@ class RegressionReader(BaseNLPReader): ...@@ -666,7 +672,8 @@ class RegressionReader(BaseNLPReader):
batch_size=1, batch_size=1,
phase='train', phase='train',
shuffle=True, shuffle=True,
data=None): data=None,
return_list=True):
if phase != 'predict' and not self.dataset: if phase != 'predict' and not self.dataset:
raise ValueError("The dataset is none and it's not allowed.") raise ValueError("The dataset is none and it's not allowed.")
if phase == 'train': if phase == 'train':
...@@ -715,7 +722,12 @@ class RegressionReader(BaseNLPReader): ...@@ -715,7 +722,12 @@ class RegressionReader(BaseNLPReader):
for batch_data in self._prepare_batch_data( for batch_data in self._prepare_batch_data(
examples, batch_size, phase=phase): examples, batch_size, phase=phase):
yield [batch_data] if return_list:
# for DataFeeder
yield [batch_data]
else:
# for DataLoader
yield batch_data
return wrapper return wrapper
...@@ -884,7 +896,8 @@ class ReadingComprehensionReader(BaseNLPReader): ...@@ -884,7 +896,8 @@ class ReadingComprehensionReader(BaseNLPReader):
batch_size=1, batch_size=1,
phase='train', phase='train',
shuffle=False, shuffle=False,
data=None): data=None,
return_list=True):
# we need all_examples and all_features in write_prediction in reading_comprehension_task # we need all_examples and all_features in write_prediction in reading_comprehension_task
# we can also use all_examples and all_features to avoid duplicate long-time preprocessing # we can also use all_examples and all_features to avoid duplicate long-time preprocessing
examples = None examples = None
...@@ -926,7 +939,12 @@ class ReadingComprehensionReader(BaseNLPReader): ...@@ -926,7 +939,12 @@ class ReadingComprehensionReader(BaseNLPReader):
for batch_data in self._prepare_batch_data( for batch_data in self._prepare_batch_data(
features, batch_size, phase=phase): features, batch_size, phase=phase):
yield [batch_data] if return_list:
# for DataFeeder
yield [batch_data]
else:
# for DataLoader
yield batch_data
return wrapper return wrapper
...@@ -1147,12 +1165,20 @@ class LACClassifyReader(BaseReader): ...@@ -1147,12 +1165,20 @@ class LACClassifyReader(BaseReader):
self.feed_key = list( self.feed_key = list(
self.lac.processor.data_format( self.lac.processor.data_format(
sign_name="lexical_analysis").keys())[0] sign_name="lexical_analysis").keys())[0]
self.has_processed = {
"train": False,
"dev": False,
"val": False,
"test": False,
"predict": False
}
def data_generator(self, def data_generator(self,
batch_size=1, batch_size=1,
phase="train", phase="train",
shuffle=False, shuffle=False,
data=None): data=None,
return_list=True):
if phase != "predict" and not self.dataset: if phase != "predict" and not self.dataset:
raise ValueError("The dataset is None and it isn't allowed.") raise ValueError("The dataset is None and it isn't allowed.")
if phase == "train": if phase == "train":
...@@ -1180,32 +1206,96 @@ class LACClassifyReader(BaseReader): ...@@ -1180,32 +1206,96 @@ class LACClassifyReader(BaseReader):
self.vocab[word] for word in processed[0]['word'] self.vocab[word] for word in processed[0]['word']
if word in self.vocab if word in self.vocab
] ]
if len(processed) == 0: if len(processed) == 0:
if six.PY2: if six.PY2:
text = text.encode(sys_stdout_encoding()) text = text.encode(sys_stdout_encoding())
logger.warning( logger.warning(
"The words in text %s can't be found in the vocabulary." % "The words in text %s can't be found in the vocabulary." %
(text)) (text))
return processed return processed
if not self.has_processed[phase]:
logger.info(
"processing %s data now... this may take a few minutes" % phase)
for i in range(len(data)):
if phase == "predict":
data[i] = preprocess(data[i])
else:
data[i].text_a = preprocess(data[i].text_a)
if self.label_map:
if data[i].label not in self.label_map:
raise KeyError("example.label = {%s} not in label" %
data[i].label)
label_id = self.label_map[data[i].label]
else:
label_id = data[i].label
data[i].label = label_id
self.has_processed[phase] = True
def _data_reader(): def _data_reader():
if shuffle: if shuffle:
np.random.shuffle(data) np.random.shuffle(data)
texts = []
labels = []
if phase == "predict": if phase == "predict":
for text in data: for text in data:
text = preprocess(text)
if not text: if not text:
continue continue
yield (text, ) texts.append(text)
if len(texts) == batch_size:
if return_list:
# for DataFeeder
# if you want to use high-performance predictor, yield [[[t] for t in texts]]
yield [[t] for t in texts]
else:
# for DataLoader
# cannot use in high-performance predictor, as PaddleTensor rejects lod_tensor
texts = fluid.create_lod_tensor(
texts, [[len(seq) for seq in texts]],
fluid.CPUPlace())
yield [texts]
texts = []
if texts:
if return_list:
yield [[t] for t in texts]
else:
texts = fluid.create_lod_tensor(
texts, [[len(seq) for seq in texts]],
fluid.CPUPlace())
yield [texts]
texts = []
else: else:
for item in data: for item in data:
text = preprocess(item.text_a) text = item.text_a
if not text: if not text:
continue continue
yield (text, item.label) texts.append(text)
labels.append([item.label])
return paddle.batch(_data_reader, batch_size=batch_size) if len(texts) == batch_size:
if return_list:
yield list(zip(texts, labels))
else:
texts = fluid.create_lod_tensor(
texts, [[len(seq) for seq in texts]],
fluid.CPUPlace())
yield [texts, labels]
texts = []
labels = []
if texts:
if return_list:
yield list(zip(texts, labels))
else:
texts = fluid.create_lod_tensor(
texts, [[len(seq) for seq in texts]],
fluid.CPUPlace())
yield [texts, labels]
texts = []
labels = []
return _data_reader
if __name__ == '__main__': if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册