Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleHub
提交
2ef7c1e9
P
PaddleHub
项目概览
PaddlePaddle
/
PaddleHub
接近 2 年 前同步成功
通知
284
Star
12117
Fork
2091
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
200
列表
看板
标记
里程碑
合并请求
4
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleHub
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
200
Issue
200
列表
看板
标记
里程碑
合并请求
4
合并请求
4
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
2ef7c1e9
编写于
3月 05, 2020
作者:
W
wuzewu
提交者:
GitHub
3月 05, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add high performance, dataloader and annotation (#406)
* use dataloader
上级
2865db04
变更
7
展开全部
隐藏空白更改
内联
并排
Showing
7 changed files
with
477 additions
and
203 deletions
+477
-203
demo/reading_comprehension/run_finetune.sh
demo/reading_comprehension/run_finetune.sh
+1
-1
paddlehub/__init__.py
paddlehub/__init__.py
+1
-0
paddlehub/finetune/task/base_task.py
paddlehub/finetune/task/base_task.py
+314
-168
paddlehub/finetune/task/reading_comprehension_task.py
paddlehub/finetune/task/reading_comprehension_task.py
+2
-1
paddlehub/finetune/task/sequence_task.py
paddlehub/finetune/task/sequence_task.py
+12
-11
paddlehub/reader/cv_reader.py
paddlehub/reader/cv_reader.py
+42
-7
paddlehub/reader/nlp_reader.py
paddlehub/reader/nlp_reader.py
+105
-15
未找到文件。
demo/reading_comprehension/run_finetune.sh
浏览文件 @
2ef7c1e9
...
...
@@ -16,4 +16,4 @@ python -u reading_comprehension.py \
--warmup_proportion
=
0.1
\
--num_epoch
=
2
\
--max_seq_len
=
512
\
--use_data_parallel
=
Tru
e
--use_data_parallel
=
Fals
e
paddlehub/__init__.py
浏览文件 @
2ef7c1e9
...
...
@@ -46,6 +46,7 @@ from .module.manager import default_module_manager
from
.io.type
import
DataType
from
.finetune.task
import
BaseTask
from
.finetune.task
import
ClassifierTask
from
.finetune.task
import
TextClassifierTask
from
.finetune.task
import
ImageClassifierTask
...
...
paddlehub/finetune/task/base_task.py
浏览文件 @
2ef7c1e9
此差异已折叠。
点击以展开。
paddlehub/finetune/task/reading_comprehension_task.py
浏览文件 @
2ef7c1e9
...
...
@@ -409,7 +409,8 @@ class ReadingComprehensionTask(BaseTask):
def
_build_net
(
self
):
self
.
unique_ids
=
fluid
.
layers
.
data
(
name
=
"unique_ids"
,
shape
=
[
-
1
,
1
],
lod_level
=
0
,
dtype
=
"int64"
)
# to avoid memory optimization
_
=
fluid
.
layers
.
assign
(
self
.
unique_ids
)
logits
=
fluid
.
layers
.
fc
(
input
=
self
.
feature
,
size
=
2
,
...
...
paddlehub/finetune/task/sequence_task.py
浏览文件 @
2ef7c1e9
...
...
@@ -64,17 +64,17 @@ class SequenceLabelTask(BaseTask):
return
True
def
_build_net
(
self
):
self
.
seq_len
=
fluid
.
layers
.
data
(
name
=
"seq_len"
,
shape
=
[
1
],
dtype
=
'int64'
,
lod_level
=
0
)
if
version_compare
(
paddle
.
__version__
,
"1.6"
):
self
.
seq_len
=
fluid
.
layers
.
data
(
name
=
"seq_len"
,
shape
=
[
-
1
],
dtype
=
'int64'
)
self
.
seq_len_used
=
fluid
.
layers
.
squeeze
(
self
.
seq_len
,
axes
=
[
1
])
else
:
self
.
seq_len
=
fluid
.
layers
.
data
(
name
=
"seq_len"
,
shape
=
[
1
],
dtype
=
'int64'
)
seq_len
=
fluid
.
layers
.
assign
(
self
.
seq_len
)
self
.
seq_len_used
=
self
.
seq_len
if
self
.
add_crf
:
unpad_feature
=
fluid
.
layers
.
sequence_unpad
(
self
.
feature
,
length
=
self
.
seq_len
)
self
.
feature
,
length
=
self
.
seq_len
_used
)
self
.
emission
=
fluid
.
layers
.
fc
(
size
=
self
.
num_classes
,
input
=
unpad_feature
,
...
...
@@ -103,7 +103,6 @@ class SequenceLabelTask(BaseTask):
self
.
ret_infers
=
fluid
.
layers
.
reshape
(
x
=
fluid
.
layers
.
argmax
(
self
.
logits
,
axis
=
2
),
shape
=
[
-
1
,
1
])
ret_infers
=
fluid
.
layers
.
assign
(
self
.
ret_infers
)
logits
=
self
.
logits
logits
=
fluid
.
layers
.
flatten
(
logits
,
axis
=
2
)
...
...
@@ -118,7 +117,8 @@ class SequenceLabelTask(BaseTask):
def
_add_loss
(
self
):
if
self
.
add_crf
:
labels
=
fluid
.
layers
.
sequence_unpad
(
self
.
labels
[
0
],
self
.
seq_len
)
labels
=
fluid
.
layers
.
sequence_unpad
(
self
.
labels
[
0
],
self
.
seq_len_used
)
crf_cost
=
fluid
.
layers
.
linear_chain_crf
(
input
=
self
.
emission
,
label
=
labels
,
...
...
@@ -133,7 +133,8 @@ class SequenceLabelTask(BaseTask):
def
_add_metrics
(
self
):
if
self
.
add_crf
:
labels
=
fluid
.
layers
.
sequence_unpad
(
self
.
labels
[
0
],
self
.
seq_len
)
labels
=
fluid
.
layers
.
sequence_unpad
(
self
.
labels
[
0
],
self
.
seq_len_used
)
(
precision
,
recall
,
f1_score
,
num_infer_chunks
,
num_label_chunks
,
num_correct_chunks
)
=
fluid
.
layers
.
chunk_eval
(
input
=
self
.
outputs
[
0
],
...
...
@@ -146,7 +147,7 @@ class SequenceLabelTask(BaseTask):
else
:
self
.
ret_labels
=
fluid
.
layers
.
reshape
(
x
=
self
.
labels
[
0
],
shape
=
[
-
1
,
1
])
return
[
self
.
ret_labels
,
self
.
ret_infers
,
self
.
seq_len
]
return
[
self
.
ret_labels
,
self
.
ret_infers
,
self
.
seq_len
_used
]
def
_calculate_metrics
(
self
,
run_states
):
total_infer
=
total_label
=
total_correct
=
loss_sum
=
0
...
...
@@ -214,7 +215,7 @@ class SequenceLabelTask(BaseTask):
if
self
.
is_train_phase
or
self
.
is_test_phase
:
return
[
metric
.
name
for
metric
in
self
.
metrics
]
+
[
self
.
loss
.
name
]
elif
self
.
is_predict_phase
:
return
[
self
.
ret_infers
.
name
]
+
[
self
.
seq_len
.
name
]
return
[
self
.
ret_infers
.
name
]
+
[
self
.
seq_len
_used
.
name
]
return
[
output
.
name
for
output
in
self
.
outputs
]
def
_postprocessing
(
self
,
run_states
):
...
...
paddlehub/reader/cv_reader.py
浏览文件 @
2ef7c1e9
#coding:utf-8
#
coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
...
...
@@ -77,7 +77,8 @@ class ImageClassificationReader(BaseReader):
batch_size
=
1
,
phase
=
"train"
,
shuffle
=
False
,
data
=
None
):
data
=
None
,
return_list
=
True
):
if
phase
!=
'predict'
and
not
self
.
dataset
:
raise
ValueError
(
"The dataset is none and it's not allowed!"
)
if
phase
==
"train"
:
...
...
@@ -135,14 +136,48 @@ class ImageClassificationReader(BaseReader):
def
_data_reader
():
if
shuffle
:
np
.
random
.
shuffle
(
data
)
images
=
[]
labels
=
[]
if
phase
==
"predict"
:
for
image_path
in
data
:
image
=
preprocess
(
image_path
)
yield
(
image
,
)
images
.
append
(
image
.
astype
(
'float32'
))
if
len
(
images
)
==
batch_size
:
# predictor must receive numpy array not list
images
=
np
.
array
([
images
]).
astype
(
'float32'
)
if
return_list
:
# for DataFeeder
yield
[
images
]
else
:
# for DataLoader
yield
images
images
=
[]
if
images
:
images
=
np
.
array
([
images
]).
astype
(
'float32'
)
if
return_list
:
yield
[
images
]
else
:
yield
images
images
=
[]
else
:
for
image_path
,
label
in
data
:
image
=
preprocess
(
image_path
)
yield
(
image
,
label
)
return
paddle
.
batch
(
_data_reader
,
batch_size
=
batch_size
)
images
.
append
(
image
.
astype
(
'float32'
))
labels
.
append
([
int
(
label
)])
if
len
(
images
)
==
batch_size
:
if
return_list
:
yield
[[
images
,
labels
]]
else
:
yield
[
images
,
labels
]
images
=
[]
labels
=
[]
if
images
:
if
return_list
:
yield
[[
images
,
labels
]]
else
:
yield
[
images
,
labels
]
images
=
[]
labels
=
[]
return
_data_reader
paddlehub/reader/nlp_reader.py
浏览文件 @
2ef7c1e9
...
...
@@ -22,7 +22,7 @@ import numpy as np
import
six
from
collections
import
namedtuple
import
paddle
import
paddle
.fluid
as
fluid
from
paddlehub.reader
import
tokenization
from
paddlehub.common.logger
import
logger
...
...
@@ -203,7 +203,8 @@ class BaseNLPReader(BaseReader):
batch_size
=
1
,
phase
=
'train'
,
shuffle
=
True
,
data
=
None
):
data
=
None
,
return_list
=
True
):
if
phase
!=
'predict'
and
not
self
.
dataset
:
raise
ValueError
(
"The dataset is None ! It isn't allowed."
)
if
phase
==
'train'
:
...
...
@@ -255,7 +256,12 @@ class BaseNLPReader(BaseReader):
for
batch_data
in
self
.
_prepare_batch_data
(
examples
,
batch_size
,
phase
=
phase
):
yield
[
batch_data
]
if
return_list
:
# for DataFeeder
yield
[
batch_data
]
else
:
# for DataLoader
yield
batch_data
return
wrapper
...
...
@@ -666,7 +672,8 @@ class RegressionReader(BaseNLPReader):
batch_size
=
1
,
phase
=
'train'
,
shuffle
=
True
,
data
=
None
):
data
=
None
,
return_list
=
True
):
if
phase
!=
'predict'
and
not
self
.
dataset
:
raise
ValueError
(
"The dataset is none and it's not allowed."
)
if
phase
==
'train'
:
...
...
@@ -715,7 +722,12 @@ class RegressionReader(BaseNLPReader):
for
batch_data
in
self
.
_prepare_batch_data
(
examples
,
batch_size
,
phase
=
phase
):
yield
[
batch_data
]
if
return_list
:
# for DataFeeder
yield
[
batch_data
]
else
:
# for DataLoader
yield
batch_data
return
wrapper
...
...
@@ -884,7 +896,8 @@ class ReadingComprehensionReader(BaseNLPReader):
batch_size
=
1
,
phase
=
'train'
,
shuffle
=
False
,
data
=
None
):
data
=
None
,
return_list
=
True
):
# we need all_examples and all_features in write_prediction in reading_comprehension_task
# we can also use all_examples and all_features to avoid duplicate long-time preprocessing
examples
=
None
...
...
@@ -926,7 +939,12 @@ class ReadingComprehensionReader(BaseNLPReader):
for
batch_data
in
self
.
_prepare_batch_data
(
features
,
batch_size
,
phase
=
phase
):
yield
[
batch_data
]
if
return_list
:
# for DataFeeder
yield
[
batch_data
]
else
:
# for DataLoader
yield
batch_data
return
wrapper
...
...
@@ -1147,12 +1165,20 @@ class LACClassifyReader(BaseReader):
self
.
feed_key
=
list
(
self
.
lac
.
processor
.
data_format
(
sign_name
=
"lexical_analysis"
).
keys
())[
0
]
self
.
has_processed
=
{
"train"
:
False
,
"dev"
:
False
,
"val"
:
False
,
"test"
:
False
,
"predict"
:
False
}
def
data_generator
(
self
,
batch_size
=
1
,
phase
=
"train"
,
shuffle
=
False
,
data
=
None
):
data
=
None
,
return_list
=
True
):
if
phase
!=
"predict"
and
not
self
.
dataset
:
raise
ValueError
(
"The dataset is None and it isn't allowed."
)
if
phase
==
"train"
:
...
...
@@ -1180,32 +1206,96 @@ class LACClassifyReader(BaseReader):
self
.
vocab
[
word
]
for
word
in
processed
[
0
][
'word'
]
if
word
in
self
.
vocab
]
if
len
(
processed
)
==
0
:
if
six
.
PY2
:
text
=
text
.
encode
(
sys_stdout_encoding
())
logger
.
warning
(
"The words in text %s can't be found in the vocabulary."
%
(
text
))
return
processed
if
not
self
.
has_processed
[
phase
]:
logger
.
info
(
"processing %s data now... this may take a few minutes"
%
phase
)
for
i
in
range
(
len
(
data
)):
if
phase
==
"predict"
:
data
[
i
]
=
preprocess
(
data
[
i
])
else
:
data
[
i
].
text_a
=
preprocess
(
data
[
i
].
text_a
)
if
self
.
label_map
:
if
data
[
i
].
label
not
in
self
.
label_map
:
raise
KeyError
(
"example.label = {%s} not in label"
%
data
[
i
].
label
)
label_id
=
self
.
label_map
[
data
[
i
].
label
]
else
:
label_id
=
data
[
i
].
label
data
[
i
].
label
=
label_id
self
.
has_processed
[
phase
]
=
True
def
_data_reader
():
if
shuffle
:
np
.
random
.
shuffle
(
data
)
texts
=
[]
labels
=
[]
if
phase
==
"predict"
:
for
text
in
data
:
text
=
preprocess
(
text
)
if
not
text
:
continue
yield
(
text
,
)
texts
.
append
(
text
)
if
len
(
texts
)
==
batch_size
:
if
return_list
:
# for DataFeeder
# if you want to use high-performance predictor, yield [[[t] for t in texts]]
yield
[[
t
]
for
t
in
texts
]
else
:
# for DataLoader
# cannot use in high-performance predictor, as PaddleTensor rejects lod_tensor
texts
=
fluid
.
create_lod_tensor
(
texts
,
[[
len
(
seq
)
for
seq
in
texts
]],
fluid
.
CPUPlace
())
yield
[
texts
]
texts
=
[]
if
texts
:
if
return_list
:
yield
[[
t
]
for
t
in
texts
]
else
:
texts
=
fluid
.
create_lod_tensor
(
texts
,
[[
len
(
seq
)
for
seq
in
texts
]],
fluid
.
CPUPlace
())
yield
[
texts
]
texts
=
[]
else
:
for
item
in
data
:
text
=
preprocess
(
item
.
text_a
)
text
=
item
.
text_a
if
not
text
:
continue
yield
(
text
,
item
.
label
)
return
paddle
.
batch
(
_data_reader
,
batch_size
=
batch_size
)
texts
.
append
(
text
)
labels
.
append
([
item
.
label
])
if
len
(
texts
)
==
batch_size
:
if
return_list
:
yield
list
(
zip
(
texts
,
labels
))
else
:
texts
=
fluid
.
create_lod_tensor
(
texts
,
[[
len
(
seq
)
for
seq
in
texts
]],
fluid
.
CPUPlace
())
yield
[
texts
,
labels
]
texts
=
[]
labels
=
[]
if
texts
:
if
return_list
:
yield
list
(
zip
(
texts
,
labels
))
else
:
texts
=
fluid
.
create_lod_tensor
(
texts
,
[[
len
(
seq
)
for
seq
in
texts
]],
fluid
.
CPUPlace
())
yield
[
texts
,
labels
]
texts
=
[]
labels
=
[]
return
_data_reader
if
__name__
==
'__main__'
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录