Unverified commit 2ef7c1e9, authored by wuzewu, committed by GitHub

add high performance, dataloader and annotation (#406)

* use dataloader
Parent 2865db04
......@@ -16,4 +16,4 @@ python -u reading_comprehension.py \
--warmup_proportion=0.1 \
--num_epoch=2 \
--max_seq_len=512 \
--use_data_parallel=True
--use_data_parallel=False
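
The demo script now runs with data parallelism switched off. For context, a minimal sketch of how such a flag is usually wired into the fine-tune configuration; the argparse wiring and RunConfig arguments below are illustrative, not copied from the demo script:

import argparse
import paddlehub as hub

# Parse the boolean flag and hand it to the task's RunConfig, so multi-card
# data parallelism can be toggled from the command line.
parser = argparse.ArgumentParser()
parser.add_argument("--use_data_parallel", type=lambda v: v.lower() == "true",
                    default=False)
args = parser.parse_args()

config = hub.RunConfig(
    use_data_parallel=args.use_data_parallel,
    use_cuda=True,
    num_epoch=2,
    batch_size=8)
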
......@@ -46,6 +46,7 @@ from .module.manager import default_module_manager
from .io.type import DataType
from .finetune.task import BaseTask
from .finetune.task import ClassifierTask
from .finetune.task import TextClassifierTask
from .finetune.task import ImageClassifierTask
......
......@@ -409,7 +409,8 @@ class ReadingComprehensionTask(BaseTask):
def _build_net(self):
self.unique_ids = fluid.layers.data(
name="unique_ids", shape=[-1, 1], lod_level=0, dtype="int64")
        # assign keeps unique_ids from being pruned by memory optimization
_ = fluid.layers.assign(self.unique_ids)
logits = fluid.layers.fc(
input=self.feature,
size=2,
......
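
The extra assign gives unique_ids a consumer, so it survives Paddle's memory-optimization passes and can still be fetched at predict time. A minimal standalone sketch of the same keep-alive pattern (variable names are illustrative):

import paddle.fluid as fluid

# A feed variable that no operator consumes may be pruned or have its memory
# reused by optimization passes; assign() adds a copy op, so the original
# values remain fetchable when the program runs.
unique_ids = fluid.layers.data(
    name="unique_ids", shape=[-1, 1], lod_level=0, dtype="int64")
unique_ids_kept = fluid.layers.assign(unique_ids)  # copy op keeps unique_ids alive
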
......@@ -64,17 +64,17 @@ class SequenceLabelTask(BaseTask):
return True
def _build_net(self):
self.seq_len = fluid.layers.data(
name="seq_len", shape=[1], dtype='int64', lod_level=0)
if version_compare(paddle.__version__, "1.6"):
self.seq_len = fluid.layers.data(
name="seq_len", shape=[-1], dtype='int64')
self.seq_len_used = fluid.layers.squeeze(self.seq_len, axes=[1])
else:
self.seq_len = fluid.layers.data(
name="seq_len", shape=[1], dtype='int64')
seq_len = fluid.layers.assign(self.seq_len)
self.seq_len_used = self.seq_len
if self.add_crf:
unpad_feature = fluid.layers.sequence_unpad(
self.feature, length=self.seq_len)
self.feature, length=self.seq_len_used)
self.emission = fluid.layers.fc(
size=self.num_classes,
input=unpad_feature,
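
For reference, a self-contained sketch of the unpad-then-score pipeline this hunk builds, with hypothetical shapes (128 tokens, 768-dim features, 7 labels); it illustrates the technique rather than reproducing the task's exact graph:

import paddle.fluid as fluid

# Padded token features plus true lengths -> variable-length sequences -> emission scores.
feature = fluid.layers.data(            # becomes [-1, 128, 768] with the batch dim
    name="feature", shape=[128, 768], dtype="float32")
seq_len = fluid.layers.data(name="seq_len", shape=[1], dtype="int64")
seq_len_used = fluid.layers.squeeze(seq_len, axes=[1])       # [-1, 1] -> [-1]
unpad_feature = fluid.layers.sequence_unpad(feature, length=seq_len_used)
emission = fluid.layers.fc(input=unpad_feature, size=7)      # per-token label scores
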
......@@ -103,7 +103,6 @@ class SequenceLabelTask(BaseTask):
self.ret_infers = fluid.layers.reshape(
x=fluid.layers.argmax(self.logits, axis=2), shape=[-1, 1])
ret_infers = fluid.layers.assign(self.ret_infers)
logits = self.logits
logits = fluid.layers.flatten(logits, axis=2)
......@@ -118,7 +117,8 @@ class SequenceLabelTask(BaseTask):
def _add_loss(self):
if self.add_crf:
labels = fluid.layers.sequence_unpad(self.labels[0], self.seq_len)
labels = fluid.layers.sequence_unpad(self.labels[0],
self.seq_len_used)
crf_cost = fluid.layers.linear_chain_crf(
input=self.emission,
label=labels,
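
A self-contained sketch of the CRF loss and decoding pair this hunk feeds; the lod_level=1 inputs and the shared "crfw" parameter name follow the common Paddle pattern and are illustrative rather than the task's exact code:

import paddle.fluid as fluid

emission = fluid.layers.data(            # per-token label scores, one row per token
    name="emission", shape=[7], dtype="float32", lod_level=1)
labels = fluid.layers.data(
    name="labels", shape=[1], dtype="int64", lod_level=1)
crf_cost = fluid.layers.linear_chain_crf(
    input=emission, label=labels, param_attr=fluid.ParamAttr(name="crfw"))
loss = fluid.layers.mean(crf_cost)
# Decoding reuses the transition parameters learned by the CRF loss.
crf_decode = fluid.layers.crf_decoding(
    input=emission, param_attr=fluid.ParamAttr(name="crfw"))
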
......@@ -133,7 +133,8 @@ class SequenceLabelTask(BaseTask):
def _add_metrics(self):
if self.add_crf:
labels = fluid.layers.sequence_unpad(self.labels[0], self.seq_len)
labels = fluid.layers.sequence_unpad(self.labels[0],
self.seq_len_used)
(precision, recall, f1_score, num_infer_chunks, num_label_chunks,
num_correct_chunks) = fluid.layers.chunk_eval(
input=self.outputs[0],
......@@ -146,7 +147,7 @@ class SequenceLabelTask(BaseTask):
else:
self.ret_labels = fluid.layers.reshape(
x=self.labels[0], shape=[-1, 1])
return [self.ret_labels, self.ret_infers, self.seq_len]
return [self.ret_labels, self.ret_infers, self.seq_len_used]
def _calculate_metrics(self, run_states):
total_infer = total_label = total_correct = loss_sum = 0
......@@ -214,7 +215,7 @@ class SequenceLabelTask(BaseTask):
if self.is_train_phase or self.is_test_phase:
return [metric.name for metric in self.metrics] + [self.loss.name]
elif self.is_predict_phase:
return [self.ret_infers.name] + [self.seq_len.name]
return [self.ret_infers.name] + [self.seq_len_used.name]
return [output.name for output in self.outputs]
def _postprocessing(self, run_states):
......
#coding:utf-8
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
......@@ -77,7 +77,8 @@ class ImageClassificationReader(BaseReader):
batch_size=1,
phase="train",
shuffle=False,
data=None):
data=None,
return_list=True):
if phase != 'predict' and not self.dataset:
raise ValueError("The dataset is none and it's not allowed!")
if phase == "train":
......@@ -135,14 +136,48 @@ class ImageClassificationReader(BaseReader):
def _data_reader():
if shuffle:
np.random.shuffle(data)
images = []
labels = []
if phase == "predict":
for image_path in data:
image = preprocess(image_path)
yield (image, )
images.append(image.astype('float32'))
if len(images) == batch_size:
                        # the predictor must receive a numpy array, not a list
images = np.array([images]).astype('float32')
if return_list:
# for DataFeeder
yield [images]
else:
# for DataLoader
yield images
images = []
if images:
images = np.array([images]).astype('float32')
if return_list:
yield [images]
else:
yield images
images = []
else:
for image_path, label in data:
image = preprocess(image_path)
yield (image, label)
return paddle.batch(_data_reader, batch_size=batch_size)
images.append(image.astype('float32'))
labels.append([int(label)])
if len(images) == batch_size:
if return_list:
yield [[images, labels]]
else:
yield [images, labels]
images = []
labels = []
if images:
if return_list:
yield [[images, labels]]
else:
yield [images, labels]
images = []
labels = []
return _data_reader
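
With return_list=True the image generator yields one [images, labels] pair per batch, wrapped in a list, which is the layout fluid.DataFeeder expects. A hedged, self-contained sketch with a toy generator standing in for _data_reader (shapes and names are illustrative):

import numpy as np
import paddle.fluid as fluid

def toy_list_generator():
    # Same nesting as the return_list=True train branch: [[images, labels]].
    for _ in range(2):
        images = [np.random.rand(3, 224, 224).astype("float32")
                  for _ in range(4)]
        labels = [[1] for _ in range(4)]
        yield [[images, labels]]

image = fluid.layers.data(name="image", shape=[3, 224, 224], dtype="float32")
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
feeder = fluid.DataFeeder(feed_list=[image, label], place=fluid.CPUPlace())

for batch in toy_list_generator():
    feed_dict = feeder.feed(batch)   # {"image": LoDTensor, "label": LoDTensor}
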
......@@ -22,7 +22,7 @@ import numpy as np
import six
from collections import namedtuple
import paddle
import paddle.fluid as fluid
from paddlehub.reader import tokenization
from paddlehub.common.logger import logger
......@@ -203,7 +203,8 @@ class BaseNLPReader(BaseReader):
batch_size=1,
phase='train',
shuffle=True,
data=None):
data=None,
return_list=True):
if phase != 'predict' and not self.dataset:
raise ValueError("The dataset is None ! It isn't allowed.")
if phase == 'train':
......@@ -255,7 +256,12 @@ class BaseNLPReader(BaseReader):
for batch_data in self._prepare_batch_data(
examples, batch_size, phase=phase):
yield [batch_data]
if return_list:
# for DataFeeder
yield [batch_data]
else:
# for DataLoader
yield batch_data
return wrapper
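
With return_list=False the wrapper yields already-batched numpy arrays, the format that fluid.io.DataLoader.set_batch_generator consumes (Paddle >= 1.6 assumed). A hedged sketch with a toy generator in place of the reader's wrapper; field names, shapes, and capacity are illustrative:

import numpy as np
import paddle.fluid as fluid

def toy_batch_generator():
    # One list of per-field numpy arrays per batch, mirroring `yield batch_data`.
    for _ in range(4):
        token_ids = np.random.randint(0, 100, size=(8, 128, 1)).astype("int64")
        labels = np.random.randint(0, 2, size=(8, 1)).astype("int64")
        yield [token_ids, labels]

token_ids_var = fluid.layers.data(name="token_ids", shape=[128, 1], dtype="int64")
labels_var = fluid.layers.data(name="labels", shape=[1], dtype="int64")
loader = fluid.io.DataLoader.from_generator(
    feed_list=[token_ids_var, labels_var], capacity=16, iterable=True)
loader.set_batch_generator(toy_batch_generator, places=fluid.CPUPlace())

for feed_data in loader():
    pass   # feed_data goes straight to exe.run(program, feed=feed_data, ...)
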
......@@ -666,7 +672,8 @@ class RegressionReader(BaseNLPReader):
batch_size=1,
phase='train',
shuffle=True,
data=None):
data=None,
return_list=True):
if phase != 'predict' and not self.dataset:
raise ValueError("The dataset is none and it's not allowed.")
if phase == 'train':
......@@ -715,7 +722,12 @@ class RegressionReader(BaseNLPReader):
for batch_data in self._prepare_batch_data(
examples, batch_size, phase=phase):
yield [batch_data]
if return_list:
# for DataFeeder
yield [batch_data]
else:
# for DataLoader
yield batch_data
return wrapper
......@@ -884,7 +896,8 @@ class ReadingComprehensionReader(BaseNLPReader):
batch_size=1,
phase='train',
shuffle=False,
data=None):
data=None,
return_list=True):
        # all_examples and all_features are needed by write_prediction in reading_comprehension_task;
        # caching them here also avoids repeating the time-consuming preprocessing
examples = None
......@@ -926,7 +939,12 @@ class ReadingComprehensionReader(BaseNLPReader):
for batch_data in self._prepare_batch_data(
features, batch_size, phase=phase):
yield [batch_data]
if return_list:
# for DataFeeder
yield [batch_data]
else:
# for DataLoader
yield batch_data
return wrapper
......@@ -1147,12 +1165,20 @@ class LACClassifyReader(BaseReader):
self.feed_key = list(
self.lac.processor.data_format(
sign_name="lexical_analysis").keys())[0]
self.has_processed = {
"train": False,
"dev": False,
"val": False,
"test": False,
"predict": False
}
def data_generator(self,
batch_size=1,
phase="train",
shuffle=False,
data=None):
data=None,
return_list=True):
if phase != "predict" and not self.dataset:
raise ValueError("The dataset is None and it isn't allowed.")
if phase == "train":
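
The has_processed map makes the expensive LAC preprocessing run once per phase, so later epochs reuse the already-converted examples in place. A minimal standalone sketch of that memoization pattern (class and method names are hypothetical):

class CachedReader(object):
    """Hypothetical reader illustrating the once-per-phase preprocessing cache."""

    def __init__(self):
        self.has_processed = {
            "train": False, "dev": False, "val": False,
            "test": False, "predict": False
        }

    def prepare(self, data, phase, preprocess):
        # Convert examples in place only the first time this phase is seen.
        if not self.has_processed[phase]:
            for i in range(len(data)):
                data[i] = preprocess(data[i])
            self.has_processed[phase] = True
        return data
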
......@@ -1180,32 +1206,96 @@ class LACClassifyReader(BaseReader):
self.vocab[word] for word in processed[0]['word']
if word in self.vocab
]
if len(processed) == 0:
if six.PY2:
text = text.encode(sys_stdout_encoding())
logger.warning(
"The words in text %s can't be found in the vocabulary." %
(text))
return processed
if not self.has_processed[phase]:
logger.info(
"processing %s data now... this may take a few minutes" % phase)
for i in range(len(data)):
if phase == "predict":
data[i] = preprocess(data[i])
else:
data[i].text_a = preprocess(data[i].text_a)
if self.label_map:
if data[i].label not in self.label_map:
raise KeyError("example.label = {%s} not in label" %
data[i].label)
label_id = self.label_map[data[i].label]
else:
label_id = data[i].label
data[i].label = label_id
self.has_processed[phase] = True
def _data_reader():
if shuffle:
np.random.shuffle(data)
texts = []
labels = []
if phase == "predict":
for text in data:
text = preprocess(text)
if not text:
continue
yield (text, )
texts.append(text)
if len(texts) == batch_size:
if return_list:
# for DataFeeder
# if you want to use high-performance predictor, yield [[[t] for t in texts]]
yield [[t] for t in texts]
else:
# for DataLoader
                            # cannot be used with the high-performance predictor, since PaddleTensor does not accept a LoDTensor
texts = fluid.create_lod_tensor(
texts, [[len(seq) for seq in texts]],
fluid.CPUPlace())
yield [texts]
texts = []
if texts:
if return_list:
yield [[t] for t in texts]
else:
texts = fluid.create_lod_tensor(
texts, [[len(seq) for seq in texts]],
fluid.CPUPlace())
yield [texts]
texts = []
else:
for item in data:
text = preprocess(item.text_a)
text = item.text_a
if not text:
continue
yield (text, item.label)
return paddle.batch(_data_reader, batch_size=batch_size)
texts.append(text)
labels.append([item.label])
if len(texts) == batch_size:
if return_list:
yield list(zip(texts, labels))
else:
texts = fluid.create_lod_tensor(
texts, [[len(seq) for seq in texts]],
fluid.CPUPlace())
yield [texts, labels]
texts = []
labels = []
if texts:
if return_list:
yield list(zip(texts, labels))
else:
texts = fluid.create_lod_tensor(
texts, [[len(seq) for seq in texts]],
fluid.CPUPlace())
yield [texts, labels]
texts = []
labels = []
return _data_reader
if __name__ == '__main__':
......
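
When return_list=False, LACClassifyReader packs the variable-length word-id sequences of a batch into one LoDTensor whose LoD records each sequence's length, which is what a lod_level=1 input expects. A small standalone sketch of that call with toy ids:

import paddle.fluid as fluid

texts = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]        # toy word-id sequences
lod_tensor = fluid.create_lod_tensor(
    texts, [[len(seq) for seq in texts]], fluid.CPUPlace())
# lod_tensor now holds 9 ids with recursive sequence lengths [3, 2, 4].
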