Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleHub
提交
2ef7c1e9
P
PaddleHub
项目概览
PaddlePaddle
/
PaddleHub
大约 2 年 前同步成功
通知
285
Star
12117
Fork
2091
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
200
列表
看板
标记
里程碑
合并请求
4
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleHub
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
200
Issue
200
列表
看板
标记
里程碑
合并请求
4
合并请求
4
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
2ef7c1e9
编写于
3月 05, 2020
作者:
W
wuzewu
提交者:
GitHub
3月 05, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add high performance, dataloader and annotation (#406)
* use dataloader
上级
2865db04
变更
7
展开全部
隐藏空白更改
内联
并排
Showing
7 changed file
with
477 addition
and
203 deletion
+477
-203
demo/reading_comprehension/run_finetune.sh
demo/reading_comprehension/run_finetune.sh
+1
-1
paddlehub/__init__.py
paddlehub/__init__.py
+1
-0
paddlehub/finetune/task/base_task.py
paddlehub/finetune/task/base_task.py
+314
-168
paddlehub/finetune/task/reading_comprehension_task.py
paddlehub/finetune/task/reading_comprehension_task.py
+2
-1
paddlehub/finetune/task/sequence_task.py
paddlehub/finetune/task/sequence_task.py
+12
-11
paddlehub/reader/cv_reader.py
paddlehub/reader/cv_reader.py
+42
-7
paddlehub/reader/nlp_reader.py
paddlehub/reader/nlp_reader.py
+105
-15
未找到文件。
demo/reading_comprehension/run_finetune.sh
浏览文件 @
2ef7c1e9
...
@@ -16,4 +16,4 @@ python -u reading_comprehension.py \
...
@@ -16,4 +16,4 @@ python -u reading_comprehension.py \
--warmup_proportion
=
0.1
\
--warmup_proportion
=
0.1
\
--num_epoch
=
2
\
--num_epoch
=
2
\
--max_seq_len
=
512
\
--max_seq_len
=
512
\
--use_data_parallel
=
Tru
e
--use_data_parallel
=
Fals
e
paddlehub/__init__.py
浏览文件 @
2ef7c1e9
...
@@ -46,6 +46,7 @@ from .module.manager import default_module_manager
...
@@ -46,6 +46,7 @@ from .module.manager import default_module_manager
from
.io.type
import
DataType
from
.io.type
import
DataType
from
.finetune.task
import
BaseTask
from
.finetune.task
import
ClassifierTask
from
.finetune.task
import
ClassifierTask
from
.finetune.task
import
TextClassifierTask
from
.finetune.task
import
TextClassifierTask
from
.finetune.task
import
ImageClassifierTask
from
.finetune.task
import
ImageClassifierTask
...
...
paddlehub/finetune/task/base_task.py
浏览文件 @
2ef7c1e9
此差异已折叠。
点击以展开。
paddlehub/finetune/task/reading_comprehension_task.py
浏览文件 @
2ef7c1e9
...
@@ -409,7 +409,8 @@ class ReadingComprehensionTask(BaseTask):
...
@@ -409,7 +409,8 @@ class ReadingComprehensionTask(BaseTask):
def
_build_net
(
self
):
def
_build_net
(
self
):
self
.
unique_ids
=
fluid
.
layers
.
data
(
self
.
unique_ids
=
fluid
.
layers
.
data
(
name
=
"unique_ids"
,
shape
=
[
-
1
,
1
],
lod_level
=
0
,
dtype
=
"int64"
)
name
=
"unique_ids"
,
shape
=
[
-
1
,
1
],
lod_level
=
0
,
dtype
=
"int64"
)
# to avoid memory optimization
_
=
fluid
.
layers
.
assign
(
self
.
unique_ids
)
logits
=
fluid
.
layers
.
fc
(
logits
=
fluid
.
layers
.
fc
(
input
=
self
.
feature
,
input
=
self
.
feature
,
size
=
2
,
size
=
2
,
...
...
paddlehub/finetune/task/sequence_task.py
浏览文件 @
2ef7c1e9
...
@@ -64,17 +64,17 @@ class SequenceLabelTask(BaseTask):
...
@@ -64,17 +64,17 @@ class SequenceLabelTask(BaseTask):
return
True
return
True
def
_build_net
(
self
):
def
_build_net
(
self
):
self
.
seq_len
=
fluid
.
layers
.
data
(
name
=
"seq_len"
,
shape
=
[
1
],
dtype
=
'int64'
,
lod_level
=
0
)
if
version_compare
(
paddle
.
__version__
,
"1.6"
):
if
version_compare
(
paddle
.
__version__
,
"1.6"
):
self
.
seq_len
=
fluid
.
layers
.
data
(
self
.
seq_len_used
=
fluid
.
layers
.
squeeze
(
self
.
seq_len
,
axes
=
[
1
])
name
=
"seq_len"
,
shape
=
[
-
1
],
dtype
=
'int64'
)
else
:
else
:
self
.
seq_len
=
fluid
.
layers
.
data
(
self
.
seq_len_used
=
self
.
seq_len
name
=
"seq_len"
,
shape
=
[
1
],
dtype
=
'int64'
)
seq_len
=
fluid
.
layers
.
assign
(
self
.
seq_len
)
if
self
.
add_crf
:
if
self
.
add_crf
:
unpad_feature
=
fluid
.
layers
.
sequence_unpad
(
unpad_feature
=
fluid
.
layers
.
sequence_unpad
(
self
.
feature
,
length
=
self
.
seq_len
)
self
.
feature
,
length
=
self
.
seq_len
_used
)
self
.
emission
=
fluid
.
layers
.
fc
(
self
.
emission
=
fluid
.
layers
.
fc
(
size
=
self
.
num_classes
,
size
=
self
.
num_classes
,
input
=
unpad_feature
,
input
=
unpad_feature
,
...
@@ -103,7 +103,6 @@ class SequenceLabelTask(BaseTask):
...
@@ -103,7 +103,6 @@ class SequenceLabelTask(BaseTask):
self
.
ret_infers
=
fluid
.
layers
.
reshape
(
self
.
ret_infers
=
fluid
.
layers
.
reshape
(
x
=
fluid
.
layers
.
argmax
(
self
.
logits
,
axis
=
2
),
shape
=
[
-
1
,
1
])
x
=
fluid
.
layers
.
argmax
(
self
.
logits
,
axis
=
2
),
shape
=
[
-
1
,
1
])
ret_infers
=
fluid
.
layers
.
assign
(
self
.
ret_infers
)
logits
=
self
.
logits
logits
=
self
.
logits
logits
=
fluid
.
layers
.
flatten
(
logits
,
axis
=
2
)
logits
=
fluid
.
layers
.
flatten
(
logits
,
axis
=
2
)
...
@@ -118,7 +117,8 @@ class SequenceLabelTask(BaseTask):
...
@@ -118,7 +117,8 @@ class SequenceLabelTask(BaseTask):
def
_add_loss
(
self
):
def
_add_loss
(
self
):
if
self
.
add_crf
:
if
self
.
add_crf
:
labels
=
fluid
.
layers
.
sequence_unpad
(
self
.
labels
[
0
],
self
.
seq_len
)
labels
=
fluid
.
layers
.
sequence_unpad
(
self
.
labels
[
0
],
self
.
seq_len_used
)
crf_cost
=
fluid
.
layers
.
linear_chain_crf
(
crf_cost
=
fluid
.
layers
.
linear_chain_crf
(
input
=
self
.
emission
,
input
=
self
.
emission
,
label
=
labels
,
label
=
labels
,
...
@@ -133,7 +133,8 @@ class SequenceLabelTask(BaseTask):
...
@@ -133,7 +133,8 @@ class SequenceLabelTask(BaseTask):
def
_add_metrics
(
self
):
def
_add_metrics
(
self
):
if
self
.
add_crf
:
if
self
.
add_crf
:
labels
=
fluid
.
layers
.
sequence_unpad
(
self
.
labels
[
0
],
self
.
seq_len
)
labels
=
fluid
.
layers
.
sequence_unpad
(
self
.
labels
[
0
],
self
.
seq_len_used
)
(
precision
,
recall
,
f1_score
,
num_infer_chunks
,
num_label_chunks
,
(
precision
,
recall
,
f1_score
,
num_infer_chunks
,
num_label_chunks
,
num_correct_chunks
)
=
fluid
.
layers
.
chunk_eval
(
num_correct_chunks
)
=
fluid
.
layers
.
chunk_eval
(
input
=
self
.
outputs
[
0
],
input
=
self
.
outputs
[
0
],
...
@@ -146,7 +147,7 @@ class SequenceLabelTask(BaseTask):
...
@@ -146,7 +147,7 @@ class SequenceLabelTask(BaseTask):
else
:
else
:
self
.
ret_labels
=
fluid
.
layers
.
reshape
(
self
.
ret_labels
=
fluid
.
layers
.
reshape
(
x
=
self
.
labels
[
0
],
shape
=
[
-
1
,
1
])
x
=
self
.
labels
[
0
],
shape
=
[
-
1
,
1
])
return
[
self
.
ret_labels
,
self
.
ret_infers
,
self
.
seq_len
]
return
[
self
.
ret_labels
,
self
.
ret_infers
,
self
.
seq_len
_used
]
def
_calculate_metrics
(
self
,
run_states
):
def
_calculate_metrics
(
self
,
run_states
):
total_infer
=
total_label
=
total_correct
=
loss_sum
=
0
total_infer
=
total_label
=
total_correct
=
loss_sum
=
0
...
@@ -214,7 +215,7 @@ class SequenceLabelTask(BaseTask):
...
@@ -214,7 +215,7 @@ class SequenceLabelTask(BaseTask):
if
self
.
is_train_phase
or
self
.
is_test_phase
:
if
self
.
is_train_phase
or
self
.
is_test_phase
:
return
[
metric
.
name
for
metric
in
self
.
metrics
]
+
[
self
.
loss
.
name
]
return
[
metric
.
name
for
metric
in
self
.
metrics
]
+
[
self
.
loss
.
name
]
elif
self
.
is_predict_phase
:
elif
self
.
is_predict_phase
:
return
[
self
.
ret_infers
.
name
]
+
[
self
.
seq_len
.
name
]
return
[
self
.
ret_infers
.
name
]
+
[
self
.
seq_len
_used
.
name
]
return
[
output
.
name
for
output
in
self
.
outputs
]
return
[
output
.
name
for
output
in
self
.
outputs
]
def
_postprocessing
(
self
,
run_states
):
def
_postprocessing
(
self
,
run_states
):
...
...
paddlehub/reader/cv_reader.py
浏览文件 @
2ef7c1e9
#coding:utf-8
#
coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License"
# Licensed under the Apache License, Version 2.0 (the "License"
...
@@ -77,7 +77,8 @@ class ImageClassificationReader(BaseReader):
...
@@ -77,7 +77,8 @@ class ImageClassificationReader(BaseReader):
batch_size
=
1
,
batch_size
=
1
,
phase
=
"train"
,
phase
=
"train"
,
shuffle
=
False
,
shuffle
=
False
,
data
=
None
):
data
=
None
,
return_list
=
True
):
if
phase
!=
'predict'
and
not
self
.
dataset
:
if
phase
!=
'predict'
and
not
self
.
dataset
:
raise
ValueError
(
"The dataset is none and it's not allowed!"
)
raise
ValueError
(
"The dataset is none and it's not allowed!"
)
if
phase
==
"train"
:
if
phase
==
"train"
:
...
@@ -135,14 +136,48 @@ class ImageClassificationReader(BaseReader):
...
@@ -135,14 +136,48 @@ class ImageClassificationReader(BaseReader):
def
_data_reader
():
def
_data_reader
():
if
shuffle
:
if
shuffle
:
np
.
random
.
shuffle
(
data
)
np
.
random
.
shuffle
(
data
)
images
=
[]
labels
=
[]
if
phase
==
"predict"
:
if
phase
==
"predict"
:
for
image_path
in
data
:
for
image_path
in
data
:
image
=
preprocess
(
image_path
)
image
=
preprocess
(
image_path
)
yield
(
image
,
)
images
.
append
(
image
.
astype
(
'float32'
))
if
len
(
images
)
==
batch_size
:
# predictor must receive numpy array not list
images
=
np
.
array
([
images
]).
astype
(
'float32'
)
if
return_list
:
# for DataFeeder
yield
[
images
]
else
:
# for DataLoader
yield
images
images
=
[]
if
images
:
images
=
np
.
array
([
images
]).
astype
(
'float32'
)
if
return_list
:
yield
[
images
]
else
:
yield
images
images
=
[]
else
:
else
:
for
image_path
,
label
in
data
:
for
image_path
,
label
in
data
:
image
=
preprocess
(
image_path
)
image
=
preprocess
(
image_path
)
yield
(
image
,
label
)
images
.
append
(
image
.
astype
(
'float32'
))
labels
.
append
([
int
(
label
)])
return
paddle
.
batch
(
_data_reader
,
batch_size
=
batch_size
)
if
len
(
images
)
==
batch_size
:
if
return_list
:
yield
[[
images
,
labels
]]
else
:
yield
[
images
,
labels
]
images
=
[]
labels
=
[]
if
images
:
if
return_list
:
yield
[[
images
,
labels
]]
else
:
yield
[
images
,
labels
]
images
=
[]
labels
=
[]
return
_data_reader
paddlehub/reader/nlp_reader.py
浏览文件 @
2ef7c1e9
...
@@ -22,7 +22,7 @@ import numpy as np
...
@@ -22,7 +22,7 @@ import numpy as np
import
six
import
six
from
collections
import
namedtuple
from
collections
import
namedtuple
import
paddle
import
paddle
.fluid
as
fluid
from
paddlehub.reader
import
tokenization
from
paddlehub.reader
import
tokenization
from
paddlehub.common.logger
import
logger
from
paddlehub.common.logger
import
logger
...
@@ -203,7 +203,8 @@ class BaseNLPReader(BaseReader):
...
@@ -203,7 +203,8 @@ class BaseNLPReader(BaseReader):
batch_size
=
1
,
batch_size
=
1
,
phase
=
'train'
,
phase
=
'train'
,
shuffle
=
True
,
shuffle
=
True
,
data
=
None
):
data
=
None
,
return_list
=
True
):
if
phase
!=
'predict'
and
not
self
.
dataset
:
if
phase
!=
'predict'
and
not
self
.
dataset
:
raise
ValueError
(
"The dataset is None ! It isn't allowed."
)
raise
ValueError
(
"The dataset is None ! It isn't allowed."
)
if
phase
==
'train'
:
if
phase
==
'train'
:
...
@@ -255,7 +256,12 @@ class BaseNLPReader(BaseReader):
...
@@ -255,7 +256,12 @@ class BaseNLPReader(BaseReader):
for
batch_data
in
self
.
_prepare_batch_data
(
for
batch_data
in
self
.
_prepare_batch_data
(
examples
,
batch_size
,
phase
=
phase
):
examples
,
batch_size
,
phase
=
phase
):
yield
[
batch_data
]
if
return_list
:
# for DataFeeder
yield
[
batch_data
]
else
:
# for DataLoader
yield
batch_data
return
wrapper
return
wrapper
...
@@ -666,7 +672,8 @@ class RegressionReader(BaseNLPReader):
...
@@ -666,7 +672,8 @@ class RegressionReader(BaseNLPReader):
batch_size
=
1
,
batch_size
=
1
,
phase
=
'train'
,
phase
=
'train'
,
shuffle
=
True
,
shuffle
=
True
,
data
=
None
):
data
=
None
,
return_list
=
True
):
if
phase
!=
'predict'
and
not
self
.
dataset
:
if
phase
!=
'predict'
and
not
self
.
dataset
:
raise
ValueError
(
"The dataset is none and it's not allowed."
)
raise
ValueError
(
"The dataset is none and it's not allowed."
)
if
phase
==
'train'
:
if
phase
==
'train'
:
...
@@ -715,7 +722,12 @@ class RegressionReader(BaseNLPReader):
...
@@ -715,7 +722,12 @@ class RegressionReader(BaseNLPReader):
for
batch_data
in
self
.
_prepare_batch_data
(
for
batch_data
in
self
.
_prepare_batch_data
(
examples
,
batch_size
,
phase
=
phase
):
examples
,
batch_size
,
phase
=
phase
):
yield
[
batch_data
]
if
return_list
:
# for DataFeeder
yield
[
batch_data
]
else
:
# for DataLoader
yield
batch_data
return
wrapper
return
wrapper
...
@@ -884,7 +896,8 @@ class ReadingComprehensionReader(BaseNLPReader):
...
@@ -884,7 +896,8 @@ class ReadingComprehensionReader(BaseNLPReader):
batch_size
=
1
,
batch_size
=
1
,
phase
=
'train'
,
phase
=
'train'
,
shuffle
=
False
,
shuffle
=
False
,
data
=
None
):
data
=
None
,
return_list
=
True
):
# we need all_examples and all_features in write_prediction in reading_comprehension_task
# we need all_examples and all_features in write_prediction in reading_comprehension_task
# we can also use all_examples and all_features to avoid duplicate long-time preprocessing
# we can also use all_examples and all_features to avoid duplicate long-time preprocessing
examples
=
None
examples
=
None
...
@@ -926,7 +939,12 @@ class ReadingComprehensionReader(BaseNLPReader):
...
@@ -926,7 +939,12 @@ class ReadingComprehensionReader(BaseNLPReader):
for
batch_data
in
self
.
_prepare_batch_data
(
for
batch_data
in
self
.
_prepare_batch_data
(
features
,
batch_size
,
phase
=
phase
):
features
,
batch_size
,
phase
=
phase
):
yield
[
batch_data
]
if
return_list
:
# for DataFeeder
yield
[
batch_data
]
else
:
# for DataLoader
yield
batch_data
return
wrapper
return
wrapper
...
@@ -1147,12 +1165,20 @@ class LACClassifyReader(BaseReader):
...
@@ -1147,12 +1165,20 @@ class LACClassifyReader(BaseReader):
self
.
feed_key
=
list
(
self
.
feed_key
=
list
(
self
.
lac
.
processor
.
data_format
(
self
.
lac
.
processor
.
data_format
(
sign_name
=
"lexical_analysis"
).
keys
())[
0
]
sign_name
=
"lexical_analysis"
).
keys
())[
0
]
self
.
has_processed
=
{
"train"
:
False
,
"dev"
:
False
,
"val"
:
False
,
"test"
:
False
,
"predict"
:
False
}
def
data_generator
(
self
,
def
data_generator
(
self
,
batch_size
=
1
,
batch_size
=
1
,
phase
=
"train"
,
phase
=
"train"
,
shuffle
=
False
,
shuffle
=
False
,
data
=
None
):
data
=
None
,
return_list
=
True
):
if
phase
!=
"predict"
and
not
self
.
dataset
:
if
phase
!=
"predict"
and
not
self
.
dataset
:
raise
ValueError
(
"The dataset is None and it isn't allowed."
)
raise
ValueError
(
"The dataset is None and it isn't allowed."
)
if
phase
==
"train"
:
if
phase
==
"train"
:
...
@@ -1180,32 +1206,96 @@ class LACClassifyReader(BaseReader):
...
@@ -1180,32 +1206,96 @@ class LACClassifyReader(BaseReader):
self
.
vocab
[
word
]
for
word
in
processed
[
0
][
'word'
]
self
.
vocab
[
word
]
for
word
in
processed
[
0
][
'word'
]
if
word
in
self
.
vocab
if
word
in
self
.
vocab
]
]
if
len
(
processed
)
==
0
:
if
len
(
processed
)
==
0
:
if
six
.
PY2
:
if
six
.
PY2
:
text
=
text
.
encode
(
sys_stdout_encoding
())
text
=
text
.
encode
(
sys_stdout_encoding
())
logger
.
warning
(
logger
.
warning
(
"The words in text %s can't be found in the vocabulary."
%
"The words in text %s can't be found in the vocabulary."
%
(
text
))
(
text
))
return
processed
return
processed
if
not
self
.
has_processed
[
phase
]:
logger
.
info
(
"processing %s data now... this may take a few minutes"
%
phase
)
for
i
in
range
(
len
(
data
)):
if
phase
==
"predict"
:
data
[
i
]
=
preprocess
(
data
[
i
])
else
:
data
[
i
].
text_a
=
preprocess
(
data
[
i
].
text_a
)
if
self
.
label_map
:
if
data
[
i
].
label
not
in
self
.
label_map
:
raise
KeyError
(
"example.label = {%s} not in label"
%
data
[
i
].
label
)
label_id
=
self
.
label_map
[
data
[
i
].
label
]
else
:
label_id
=
data
[
i
].
label
data
[
i
].
label
=
label_id
self
.
has_processed
[
phase
]
=
True
def
_data_reader
():
def
_data_reader
():
if
shuffle
:
if
shuffle
:
np
.
random
.
shuffle
(
data
)
np
.
random
.
shuffle
(
data
)
texts
=
[]
labels
=
[]
if
phase
==
"predict"
:
if
phase
==
"predict"
:
for
text
in
data
:
for
text
in
data
:
text
=
preprocess
(
text
)
if
not
text
:
if
not
text
:
continue
continue
yield
(
text
,
)
texts
.
append
(
text
)
if
len
(
texts
)
==
batch_size
:
if
return_list
:
# for DataFeeder
# if you want to use high-performance predictor, yield [[[t] for t in texts]]
yield
[[
t
]
for
t
in
texts
]
else
:
# for DataLoader
# cannot use in high-performance predictor, as PaddleTensor rejects lod_tensor
texts
=
fluid
.
create_lod_tensor
(
texts
,
[[
len
(
seq
)
for
seq
in
texts
]],
fluid
.
CPUPlace
())
yield
[
texts
]
texts
=
[]
if
texts
:
if
return_list
:
yield
[[
t
]
for
t
in
texts
]
else
:
texts
=
fluid
.
create_lod_tensor
(
texts
,
[[
len
(
seq
)
for
seq
in
texts
]],
fluid
.
CPUPlace
())
yield
[
texts
]
texts
=
[]
else
:
else
:
for
item
in
data
:
for
item
in
data
:
text
=
preprocess
(
item
.
text_a
)
text
=
item
.
text_a
if
not
text
:
if
not
text
:
continue
continue
yield
(
text
,
item
.
label
)
texts
.
append
(
text
)
labels
.
append
([
item
.
label
])
return
paddle
.
batch
(
_data_reader
,
batch_size
=
batch_size
)
if
len
(
texts
)
==
batch_size
:
if
return_list
:
yield
list
(
zip
(
texts
,
labels
))
else
:
texts
=
fluid
.
create_lod_tensor
(
texts
,
[[
len
(
seq
)
for
seq
in
texts
]],
fluid
.
CPUPlace
())
yield
[
texts
,
labels
]
texts
=
[]
labels
=
[]
if
texts
:
if
return_list
:
yield
list
(
zip
(
texts
,
labels
))
else
:
texts
=
fluid
.
create_lod_tensor
(
texts
,
[[
len
(
seq
)
for
seq
in
texts
]],
fluid
.
CPUPlace
())
yield
[
texts
,
labels
]
texts
=
[]
labels
=
[]
return
_data_reader
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录