Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleHub
提交
2ef7c1e9
P
PaddleHub
项目概览
PaddlePaddle
/
PaddleHub
大约 2 年 前同步成功
通知
285
Star
12117
Fork
2091
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
200
列表
看板
标记
里程碑
合并请求
4
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleHub
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
200
Issue
200
列表
看板
标记
里程碑
合并请求
4
合并请求
4
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
2ef7c1e9
编写于
3月 05, 2020
作者:
W
wuzewu
提交者:
GitHub
3月 05, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add high performance, dataloader and annotation (#406)
* use dataloader
上级
2865db04
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
477 addition
and
203 deletion
+477
-203
demo/reading_comprehension/run_finetune.sh
demo/reading_comprehension/run_finetune.sh
+1
-1
paddlehub/__init__.py
paddlehub/__init__.py
+1
-0
paddlehub/finetune/task/base_task.py
paddlehub/finetune/task/base_task.py
+314
-168
paddlehub/finetune/task/reading_comprehension_task.py
paddlehub/finetune/task/reading_comprehension_task.py
+2
-1
paddlehub/finetune/task/sequence_task.py
paddlehub/finetune/task/sequence_task.py
+12
-11
paddlehub/reader/cv_reader.py
paddlehub/reader/cv_reader.py
+42
-7
paddlehub/reader/nlp_reader.py
paddlehub/reader/nlp_reader.py
+105
-15
未找到文件。
demo/reading_comprehension/run_finetune.sh
浏览文件 @
2ef7c1e9
...
@@ -16,4 +16,4 @@ python -u reading_comprehension.py \
...
@@ -16,4 +16,4 @@ python -u reading_comprehension.py \
--warmup_proportion
=
0.1
\
--warmup_proportion
=
0.1
\
--num_epoch
=
2
\
--num_epoch
=
2
\
--max_seq_len
=
512
\
--max_seq_len
=
512
\
--use_data_parallel
=
Tru
e
--use_data_parallel
=
Fals
e
paddlehub/__init__.py
浏览文件 @
2ef7c1e9
...
@@ -46,6 +46,7 @@ from .module.manager import default_module_manager
...
@@ -46,6 +46,7 @@ from .module.manager import default_module_manager
from
.io.type
import
DataType
from
.io.type
import
DataType
from
.finetune.task
import
BaseTask
from
.finetune.task
import
ClassifierTask
from
.finetune.task
import
ClassifierTask
from
.finetune.task
import
TextClassifierTask
from
.finetune.task
import
TextClassifierTask
from
.finetune.task
import
ImageClassifierTask
from
.finetune.task
import
ImageClassifierTask
...
...
paddlehub/finetune/task/base_task.py
浏览文件 @
2ef7c1e9
...
@@ -21,7 +21,6 @@ import os
...
@@ -21,7 +21,6 @@ import os
import
contextlib
import
contextlib
import
time
import
time
import
copy
import
copy
import
logging
import
inspect
import
inspect
from
functools
import
partial
from
functools
import
partial
from
collections
import
OrderedDict
from
collections
import
OrderedDict
...
@@ -36,13 +35,21 @@ from tb_paddle import SummaryWriter
...
@@ -36,13 +35,21 @@ from tb_paddle import SummaryWriter
import
paddlehub
as
hub
import
paddlehub
as
hub
from
paddlehub.common.paddle_helper
import
dtype_map
,
clone_program
from
paddlehub.common.paddle_helper
import
dtype_map
,
clone_program
from
paddlehub.common.utils
import
mkdir
,
to_list
from
paddlehub.common.utils
import
mkdir
from
paddlehub.common.dir
import
tmp_dir
from
paddlehub.common.logger
import
logger
from
paddlehub.common.logger
import
logger
from
paddlehub.finetune.checkpoint
import
load_checkpoint
,
save_checkpoint
from
paddlehub.finetune.checkpoint
import
load_checkpoint
,
save_checkpoint
from
paddlehub.finetune.config
import
RunConfig
from
paddlehub.finetune.config
import
RunConfig
class
RunState
(
object
):
class
RunState
(
object
):
"""
RunState is used to save the result of every running step
Args:
length (int): the number of fetch result
"""
def
__init__
(
self
,
length
):
def
__init__
(
self
,
length
):
self
.
run_time_begin
=
time
.
time
()
self
.
run_time_begin
=
time
.
time
()
self
.
run_step
=
0
self
.
run_step
=
0
...
@@ -65,6 +72,10 @@ class RunState(object):
...
@@ -65,6 +72,10 @@ class RunState(object):
class
RunEnv
(
object
):
class
RunEnv
(
object
):
"""
RunEnv saves the running environment of the train/dev/predict phase, including program, reader, metrics and so on.
"""
def
__init__
(
self
):
def
__init__
(
self
):
self
.
current_epoch
=
0
self
.
current_epoch
=
0
self
.
current_step
=
0
self
.
current_step
=
0
...
@@ -87,6 +98,10 @@ class RunEnv(object):
...
@@ -87,6 +98,10 @@ class RunEnv(object):
class
TaskHooks
():
class
TaskHooks
():
"""
TaskHooks can handle some tasks during the spectific event.
"""
def
__init__
(
self
):
def
__init__
(
self
):
self
.
_registered_hooks
=
{
self
.
_registered_hooks
=
{
"build_env_start_event"
:
OrderedDict
(),
"build_env_start_event"
:
OrderedDict
(),
...
@@ -118,6 +133,14 @@ class TaskHooks():
...
@@ -118,6 +133,14 @@ class TaskHooks():
}
}
def
add
(
self
,
hook_type
,
name
=
None
,
func
=
None
):
def
add
(
self
,
hook_type
,
name
=
None
,
func
=
None
):
"""
add the handler function to spectific event.
Args:
hook_type (str): the spectific event name
name (str): the handler function name, default None
func (func): the handler function, default None
"""
if
not
func
or
not
callable
(
func
):
if
not
func
or
not
callable
(
func
):
raise
TypeError
(
raise
TypeError
(
"The hook function is empty or it is not a function"
)
"The hook function is empty or it is not a function"
)
...
@@ -142,6 +165,13 @@ class TaskHooks():
...
@@ -142,6 +165,13 @@ class TaskHooks():
self
.
_registered_hooks
[
hook_type
][
name
]
=
func
self
.
_registered_hooks
[
hook_type
][
name
]
=
func
def
delete
(
self
,
hook_type
,
name
):
def
delete
(
self
,
hook_type
,
name
):
"""
delete the handler function of spectific event.
Args:
hook_type (str): the spectific event name
name (str): the handler function name
"""
if
self
.
exist
(
hook_type
,
name
):
if
self
.
exist
(
hook_type
,
name
):
del
self
.
_registered_hooks
[
hook_type
][
name
]
del
self
.
_registered_hooks
[
hook_type
][
name
]
else
:
else
:
...
@@ -150,6 +180,14 @@ class TaskHooks():
...
@@ -150,6 +180,14 @@ class TaskHooks():
%
(
hook_type
,
name
,
hook_type
))
%
(
hook_type
,
name
,
hook_type
))
def
modify
(
self
,
hook_type
,
name
,
func
):
def
modify
(
self
,
hook_type
,
name
,
func
):
"""
modify the handler function of spectific event.
Args:
hook_type (str): the spectific event name
name (str): the handler function name
func (func): the new handler function
"""
if
not
(
isinstance
(
name
,
str
)
and
callable
(
func
)):
if
not
(
isinstance
(
name
,
str
)
and
callable
(
func
)):
raise
TypeError
(
raise
TypeError
(
"The hook name must be a string, and the hook function must be a function"
"The hook name must be a string, and the hook function must be a function"
...
@@ -162,6 +200,16 @@ class TaskHooks():
...
@@ -162,6 +200,16 @@ class TaskHooks():
%
(
hook_type
,
name
,
hook_type
))
%
(
hook_type
,
name
,
hook_type
))
def
exist
(
self
,
hook_type
,
name
):
def
exist
(
self
,
hook_type
,
name
):
"""
check if the the handler function of spectific event is existing.
Args:
hook_type (str): the spectific event name
name (str): the handler function name
Returns:
bool: True or False
"""
if
hook_type
not
in
self
.
_registered_hooks
\
if
hook_type
not
in
self
.
_registered_hooks
\
or
name
not
in
self
.
_registered_hooks
[
hook_type
]:
or
name
not
in
self
.
_registered_hooks
[
hook_type
]:
return
False
return
False
...
@@ -169,6 +217,15 @@ class TaskHooks():
...
@@ -169,6 +217,15 @@ class TaskHooks():
return
True
return
True
def
info
(
self
,
show_default
=
False
):
def
info
(
self
,
show_default
=
False
):
"""
get the hooks information, including the source code.
Args:
show_default (bool): show the information of Paddlehub default hooks or not, default False
Returns:
str: the formatted string of the hooks information
"""
# formatted output the source code
# formatted output the source code
ret
=
""
ret
=
""
for
hook_type
,
hooks
in
self
.
_registered_hooks
.
items
():
for
hook_type
,
hooks
in
self
.
_registered_hooks
.
items
():
...
@@ -194,10 +251,22 @@ class TaskHooks():
...
@@ -194,10 +251,22 @@ class TaskHooks():
return
self
.
_registered_hooks
[
hook_type
]
return
self
.
_registered_hooks
[
hook_type
]
def
__repr__
(
self
):
def
__repr__
(
self
):
return
self
.
info
(
only_customized
=
False
)
return
self
.
info
(
show_default
=
False
)
class
BaseTask
(
object
):
class
BaseTask
(
object
):
"""
BaseTask is the base class of all the task. It will complete the building of all the running environment.
Args:
feed_list (list): the inputs name
data_reader (object): data reader for the task
main_program (object): the customized main_program, default None
startup_program (object): the customized startup_program, default None
config (object): the config for the task, default None
metrics_choices (list): metrics used to the task, default ["acc"]
"""
def
__init__
(
self
,
def
__init__
(
self
,
feed_list
,
feed_list
,
data_reader
,
data_reader
,
...
@@ -205,7 +274,6 @@ class BaseTask(object):
...
@@ -205,7 +274,6 @@ class BaseTask(object):
startup_program
=
None
,
startup_program
=
None
,
config
=
None
,
config
=
None
,
metrics_choices
=
"default"
):
metrics_choices
=
"default"
):
# base item
# base item
self
.
_base_data_reader
=
data_reader
self
.
_base_data_reader
=
data_reader
self
.
_base_feed_list
=
feed_list
self
.
_base_feed_list
=
feed_list
...
@@ -270,6 +338,7 @@ class BaseTask(object):
...
@@ -270,6 +338,7 @@ class BaseTask(object):
# accelerate predict
# accelerate predict
self
.
is_best_model_loaded
=
False
self
.
is_best_model_loaded
=
False
self
.
_predictor
=
None
# set default phase
# set default phase
self
.
enter_phase
(
"train"
)
self
.
enter_phase
(
"train"
)
...
@@ -314,6 +383,9 @@ class BaseTask(object):
...
@@ -314,6 +383,9 @@ class BaseTask(object):
logger
.
info
(
"The best model has been loaded"
)
logger
.
info
(
"The best model has been loaded"
)
def
_build_env
(
self
):
def
_build_env
(
self
):
"""
building the program and strategy for specific running phase.
"""
if
self
.
env
.
is_inititalized
:
if
self
.
env
.
is_inititalized
:
return
return
...
@@ -338,46 +410,6 @@ class BaseTask(object):
...
@@ -338,46 +410,6 @@ class BaseTask(object):
hub
.
common
.
paddle_helper
.
set_op_attr
(
hub
.
common
.
paddle_helper
.
set_op_attr
(
self
.
env
.
main_program
,
is_test
=
True
)
self
.
env
.
main_program
,
is_test
=
True
)
if
self
.
config
.
use_pyreader
:
t_program
=
fluid
.
Program
()
with
fluid
.
program_guard
(
t_program
,
self
.
env
.
startup_program
):
self
.
env
.
py_reader
=
fluid
.
layers
.
py_reader
(
capacity
=
64
,
shapes
=
[
var
.
shape
for
var
in
self
.
feed_var_list
],
dtypes
=
[
dtype_map
[
var
.
dtype
]
for
var
in
self
.
feed_var_list
],
lod_levels
=
[
var
.
lod_level
for
var
in
self
.
feed_var_list
],
use_double_buffer
=
False
)
feed_var_list
=
self
.
feed_var_list
py_vars
=
fluid
.
layers
.
read_file
(
self
.
env
.
py_reader
)
py_vars
=
to_list
(
py_vars
)
input_dict
=
{
feed_var_list
[
index
].
name
:
py_var
for
index
,
py_var
in
enumerate
(
py_vars
)
}
hub
.
connect_program
(
pre_program
=
t_program
,
next_program
=
self
.
env
.
main_program
,
input_dict
=
input_dict
,
need_log
=
False
)
self
.
env
.
main_program
=
t_program
if
not
self
.
is_predict_phase
:
self
.
env
.
loss
=
self
.
env
.
main_program
.
global_block
().
vars
[
self
.
env
.
loss
.
name
]
metrics_name
=
[
var
.
name
for
var
in
self
.
env
.
metrics
]
self
.
env
.
metrics
=
[
self
.
env
.
main_program
.
global_block
().
vars
[
name
]
for
name
in
metrics_name
]
outputs_name
=
[
var
.
name
for
var
in
self
.
env
.
outputs
]
self
.
env
.
outputs
=
[
self
.
env
.
main_program
.
global_block
().
vars
[
name
]
for
name
in
outputs_name
]
if
self
.
config
.
enable_memory_optim
:
if
self
.
config
.
enable_memory_optim
:
for
var_name
in
self
.
fetch_list
:
for
var_name
in
self
.
fetch_list
:
var
=
self
.
env
.
main_program
.
global_block
().
vars
[
var_name
]
var
=
self
.
env
.
main_program
.
global_block
().
vars
[
var_name
]
...
@@ -405,7 +437,8 @@ class BaseTask(object):
...
@@ -405,7 +437,8 @@ class BaseTask(object):
self
.
env
.
main_program
).
with_data_parallel
(
self
.
env
.
main_program
).
with_data_parallel
(
loss_name
=
loss_name
,
loss_name
=
loss_name
,
share_vars_from
=
share_vars_from
,
share_vars_from
=
share_vars_from
,
build_strategy
=
self
.
build_strategy
)
build_strategy
=
self
.
build_strategy
,
places
=
self
.
places
)
self
.
exe
.
run
(
self
.
env
.
startup_program
)
self
.
exe
.
run
(
self
.
env
.
startup_program
)
self
.
_build_env_end_event
()
self
.
_build_env_end_event
()
...
@@ -501,7 +534,10 @@ class BaseTask(object):
...
@@ -501,7 +534,10 @@ class BaseTask(object):
else
:
else
:
data
=
None
data
=
None
self
.
env
.
reader
=
self
.
_base_data_reader
.
data_generator
(
self
.
env
.
reader
=
self
.
_base_data_reader
.
data_generator
(
batch_size
=
self
.
config
.
batch_size
,
phase
=
self
.
phase
,
data
=
data
)
batch_size
=
self
.
config
.
batch_size
,
phase
=
self
.
phase
,
data
=
data
,
return_list
=
not
self
.
config
.
use_pyreader
)
return
self
.
env
.
reader
return
self
.
env
.
reader
@
property
@
property
...
@@ -566,6 +602,9 @@ class BaseTask(object):
...
@@ -566,6 +602,9 @@ class BaseTask(object):
@
property
@
property
def
tb_writer
(
self
):
def
tb_writer
(
self
):
"""
get tb_writer for visualization.
"""
if
not
os
.
path
.
exists
(
self
.
config
.
checkpoint_dir
):
if
not
os
.
path
.
exists
(
self
.
config
.
checkpoint_dir
):
mkdir
(
self
.
config
.
checkpoint_dir
)
mkdir
(
self
.
config
.
checkpoint_dir
)
tb_log_dir
=
os
.
path
.
join
(
self
.
config
.
checkpoint_dir
,
"visualization"
)
tb_log_dir
=
os
.
path
.
join
(
self
.
config
.
checkpoint_dir
,
"visualization"
)
...
@@ -574,7 +613,18 @@ class BaseTask(object):
...
@@ -574,7 +613,18 @@ class BaseTask(object):
return
self
.
_tb_writer
return
self
.
_tb_writer
def
create_event_function
(
self
,
hook_type
):
def
create_event_function
(
self
,
hook_type
):
"""
create handlers for specific event.
Args:
hook_type (str): specific event name
Returns:
func: executable function, the class method will receive a parameter named self.
"""
def
hook_function
(
self
,
*
args
):
def
hook_function
(
self
,
*
args
):
# all the handler in self._hooks[hook_type] will be configured to executable
for
name
,
func
in
self
.
_hooks
[
hook_type
].
items
():
for
name
,
func
in
self
.
_hooks
[
hook_type
].
items
():
if
inspect
.
ismethod
(
func
):
if
inspect
.
ismethod
(
func
):
func
(
*
args
)
func
(
*
args
)
...
@@ -587,20 +637,52 @@ class BaseTask(object):
...
@@ -587,20 +637,52 @@ class BaseTask(object):
def
hooks
(
self
):
def
hooks
(
self
):
return
self
.
_hooks
return
self
.
_hooks
def
hooks_info
(
self
,
only_customized
=
True
):
def
hooks_info
(
self
,
show_default
=
False
):
return
self
.
_hooks
.
info
(
only_customized
)
"""
get the hooks information, including the source code.
Args:
show_default (bool): show the information of Paddlehub default hooks or not, default False
Returns:
str: the formatted string of the hooks information
"""
return
self
.
_hooks
.
info
(
show_default
)
def
add_hook
(
self
,
hook_type
,
name
=
None
,
func
=
None
):
def
add_hook
(
self
,
hook_type
,
name
=
None
,
func
=
None
):
"""
add the handler function to spectific event.
Args:
hook_type (str): the spectific event name
name (str): the handler function name, default None
func (func): the handler function, default None
"""
if
name
==
None
:
if
name
==
None
:
name
=
"hook_%s"
%
id
(
func
)
name
=
"hook_%s"
%
id
(
func
)
self
.
_hooks
.
add
(
hook_type
,
name
=
name
,
func
=
func
)
self
.
_hooks
.
add
(
hook_type
,
name
=
name
,
func
=
func
)
logger
.
info
(
"Add hook %s:%s successfully"
%
(
hook_type
,
name
))
logger
.
info
(
"Add hook %s:%s successfully"
%
(
hook_type
,
name
))
def
delete_hook
(
self
,
hook_type
,
name
):
def
delete_hook
(
self
,
hook_type
,
name
):
"""
delete the handler function of spectific event.
Args:
hook_type (str): the spectific event name
name (str): the handler function name
"""
self
.
_hooks
.
delete
(
hook_type
,
name
)
self
.
_hooks
.
delete
(
hook_type
,
name
)
logger
.
info
(
"Delete hook %s:%s successfully"
%
(
hook_type
,
name
))
logger
.
info
(
"Delete hook %s:%s successfully"
%
(
hook_type
,
name
))
def
modify_hook
(
self
,
hook_type
,
name
,
func
):
def
modify_hook
(
self
,
hook_type
,
name
,
func
):
"""
modify the handler function of spectific event.
Args:
hook_type (str): the spectific event name
name (str): the handler function name
func (func): the new handler function
"""
self
.
_hooks
.
modify
(
hook_type
,
name
,
func
)
self
.
_hooks
.
modify
(
hook_type
,
name
,
func
)
logger
.
info
(
"Modify hook %s:%s successfully"
%
(
hook_type
,
name
))
logger
.
info
(
"Modify hook %s:%s successfully"
%
(
hook_type
,
name
))
...
@@ -627,6 +709,12 @@ class BaseTask(object):
...
@@ -627,6 +709,12 @@ class BaseTask(object):
logger
.
info
(
"Evaluation on {} dataset start"
.
format
(
self
.
phase
))
logger
.
info
(
"Evaluation on {} dataset start"
.
format
(
self
.
phase
))
def
_default_eval_end_event
(
self
,
run_states
):
def
_default_eval_end_event
(
self
,
run_states
):
"""
Paddlehub default handler for eval_end_event, it will complete visualization and metrics calculation
Args:
run_states (object): the results in eval phase
"""
eval_scores
,
eval_loss
,
run_speed
=
self
.
_calculate_metrics
(
run_states
)
eval_scores
,
eval_loss
,
run_speed
=
self
.
_calculate_metrics
(
run_states
)
if
'train'
in
self
.
_envs
:
if
'train'
in
self
.
_envs
:
self
.
tb_writer
.
add_scalar
(
self
.
tb_writer
.
add_scalar
(
...
@@ -665,6 +753,12 @@ class BaseTask(object):
...
@@ -665,6 +753,12 @@ class BaseTask(object):
self
.
save_inference_model
(
dirname
=
model_saved_dir
)
self
.
save_inference_model
(
dirname
=
model_saved_dir
)
def
_default_log_interval_event
(
self
,
run_states
):
def
_default_log_interval_event
(
self
,
run_states
):
"""
PaddleHub default handler for log_interval_event, it will complete visualization.
Args:
run_states (object): the results in train phase
"""
scores
,
avg_loss
,
run_speed
=
self
.
_calculate_metrics
(
run_states
)
scores
,
avg_loss
,
run_speed
=
self
.
_calculate_metrics
(
run_states
)
self
.
tb_writer
.
add_scalar
(
self
.
tb_writer
.
add_scalar
(
tag
=
"Loss_{}"
.
format
(
self
.
phase
),
tag
=
"Loss_{}"
.
format
(
self
.
phase
),
...
@@ -763,6 +857,15 @@ class BaseTask(object):
...
@@ -763,6 +857,15 @@ class BaseTask(object):
return
self
.
finetune
(
do_eval
=
True
)
return
self
.
finetune
(
do_eval
=
True
)
def
finetune
(
self
,
do_eval
=
False
):
def
finetune
(
self
,
do_eval
=
False
):
"""
train and finetune the module parameters.
Args:
do_eval (bool): do eval during train phase or not
Returns:
RunState: the running result of train phase
"""
# Start to finetune
# Start to finetune
with
self
.
phase_guard
(
phase
=
"train"
):
with
self
.
phase_guard
(
phase
=
"train"
):
...
@@ -777,6 +880,9 @@ class BaseTask(object):
...
@@ -777,6 +880,9 @@ class BaseTask(object):
# Final evaluation
# Final evaluation
if
self
.
_base_data_reader
.
get_dev_examples
()
!=
[]:
if
self
.
_base_data_reader
.
get_dev_examples
()
!=
[]:
# Warning: DO NOT use self.eval(phase="dev", load_best_model=True) during training.
# It will cause trainer unable to continue training from checkpoint after eval.
# More important, The model should evaluate current performance during training.
self
.
eval
(
phase
=
"dev"
)
self
.
eval
(
phase
=
"dev"
)
if
self
.
_base_data_reader
.
get_test_examples
()
!=
[]:
if
self
.
_base_data_reader
.
get_test_examples
()
!=
[]:
self
.
eval
(
phase
=
"test"
,
load_best_model
=
True
)
self
.
eval
(
phase
=
"test"
,
load_best_model
=
True
)
...
@@ -787,6 +893,16 @@ class BaseTask(object):
...
@@ -787,6 +893,16 @@ class BaseTask(object):
return
run_states
return
run_states
def
eval
(
self
,
phase
=
"dev"
,
load_best_model
=
False
):
def
eval
(
self
,
phase
=
"dev"
,
load_best_model
=
False
):
"""
evaluate the performance of current module.
Args:
phase (str): current run phase
load_best_model (bool): load the best model or not
Returns:
RunState: the running result of eval phase
"""
# Warning: DO NOT use eval(load_best_model=True) in finetune_and_eval
# Warning: DO NOT use eval(load_best_model=True) in finetune_and_eval
# It will cause trainer unable to continue training from checkpoint after eval
# It will cause trainer unable to continue training from checkpoint after eval
# More important, The model should evaluate current performance during training.
# More important, The model should evaluate current performance during training.
...
@@ -800,15 +916,96 @@ class BaseTask(object):
...
@@ -800,15 +916,96 @@ class BaseTask(object):
self
.
_eval_end_event
(
run_states
)
self
.
_eval_end_event
(
run_states
)
return
run_states
return
run_states
def
predict
(
self
,
data
,
load_best_model
=
True
,
return_result
=
False
):
def
_create_predictor
(
self
):
"""
create high-performance predictor for predict.
Returns:
PaddlePredictor: the high-performance predictor
"""
with
tmp_dir
()
as
_dir
:
self
.
save_inference_model
(
dirname
=
_dir
)
predictor_config
=
fluid
.
core
.
AnalysisConfig
(
_dir
)
if
self
.
config
.
use_cuda
:
predictor_config
.
enable_use_gpu
(
100
,
0
)
predictor_config
.
switch_ir_optim
(
True
)
else
:
predictor_config
.
disable_gpu
()
predictor_config
.
enable_memory_optim
()
return
fluid
.
core
.
create_paddle_predictor
(
predictor_config
)
def
_run_with_predictor
(
self
):
"""
use high-performance predictor to make prediction.
Returns:
RunState: the running result of predict phase
"""
if
isinstance
(
self
.
_base_data_reader
,
hub
.
reader
.
LACClassifyReader
):
raise
Exception
(
"LACClassifyReader does not support predictor, please close accelerate_mode"
)
global_run_states
=
[]
period_run_states
=
[]
for
run_step
,
batch
in
enumerate
(
self
.
reader
(),
start
=
1
):
step_run_state
=
RunState
(
len
(
self
.
fetch_list
))
step_run_state
.
run_step
=
1
num_batch_examples
=
len
(
batch
)
if
not
self
.
config
.
use_pyreader
:
# if use pyreader, the nlp_reader return [batch]
batch
=
batch
[
0
]
batch
=
[
fluid
.
core
.
PaddleTensor
(
data
)
for
data
in
batch
]
fetch_result
=
self
.
_predictor
.
run
(
batch
)
for
index
,
result
in
enumerate
(
fetch_result
):
step_run_state
.
run_results
[
index
]
=
result
.
as_ndarray
()
step_run_state
.
run_examples
+=
num_batch_examples
step_run_state
.
update
()
period_run_states
+=
[
step_run_state
]
self
.
_run_step_event
(
step_run_state
)
global_run_states
+=
period_run_states
return
global_run_states
def
predict
(
self
,
data
,
load_best_model
=
True
,
return_result
=
False
,
accelerate_mode
=
False
):
"""
make prediction for the input data.
Args:
data (list): the data will be predicted.
load_best_model (bool): load the best model or not
return_result (bool): return a readable result or just the raw run result
accelerate_mode (bool): use high-performance predictor or not
Returns:
RunState: the running result of predict phase
"""
self
.
accelerate_mode
=
accelerate_mode
with
self
.
phase_guard
(
phase
=
"predict"
):
with
self
.
phase_guard
(
phase
=
"predict"
):
self
.
_predict_data
=
data
self
.
_predict_start_event
()
if
load_best_model
:
if
load_best_model
:
self
.
init_if_load_best_model
()
self
.
init_if_load_best_model
()
else
:
else
:
self
.
init_if_necessary
()
self
.
init_if_necessary
()
self
.
_predict_data
=
data
if
not
self
.
accelerate_mode
:
self
.
_predict_start_event
()
run_states
=
self
.
_run
()
run_states
=
self
.
_run
()
else
:
if
not
self
.
_predictor
:
self
.
_predictor
=
self
.
_create_predictor
()
run_states
=
self
.
_run_with_predictor
()
self
.
_predict_end_event
(
run_states
)
self
.
_predict_end_event
(
run_states
)
self
.
_predict_data
=
None
self
.
_predict_data
=
None
if
return_result
:
if
return_result
:
...
@@ -816,6 +1013,15 @@ class BaseTask(object):
...
@@ -816,6 +1013,15 @@ class BaseTask(object):
return
run_states
return
run_states
def
_postprocessing
(
self
,
run_states
):
def
_postprocessing
(
self
,
run_states
):
"""
postprocessing the run result, get readable result.
Args:
run_states (RunState): the raw run result to be processed
Returns:
list: readable result
"""
results
=
[]
results
=
[]
for
batch_state
in
run_states
:
for
batch_state
in
run_states
:
batch_result
=
batch_state
.
run_results
[
0
]
batch_result
=
batch_state
.
run_results
[
0
]
...
@@ -823,130 +1029,70 @@ class BaseTask(object):
...
@@ -823,130 +1029,70 @@ class BaseTask(object):
return
results
return
results
def
_run
(
self
,
do_eval
=
False
):
def
_run
(
self
,
do_eval
=
False
):
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
"""
if
self
.
config
.
use_pyreader
:
load data and run the program.
return
self
.
_run_with_py_reader
(
do_eval
=
do_eval
)
return
self
.
_run_with_data_feeder
(
do_eval
=
do_eval
)
def
_run_with_data_feeder
(
self
,
do_eval
=
False
):
data_feeder
=
fluid
.
DataFeeder
(
Args:
feed_list
=
self
.
feed_list
,
place
=
self
.
place
)
do_eval (bool): do eval during train phase or not
global_run_states
=
[]
Returns:
period_run_states
=
[]
RunState: the running result of specific phase
"""
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
if
self
.
config
.
use_pyreader
:
data_loader
=
fluid
.
io
.
DataLoader
.
from_generator
(
feed_list
=
self
.
feed_var_list
,
capacity
=
64
,
use_double_buffer
=
True
,
iterable
=
True
)
data_reader
=
data_loader
.
set_batch_generator
(
self
.
reader
,
places
=
self
.
places
)
else
:
data_feeder
=
fluid
.
DataFeeder
(
feed_list
=
self
.
feed_list
,
place
=
self
.
place
)
data_reader
=
data_feeder
.
decorate_reader
(
self
.
reader
,
multi_devices
=
self
.
config
.
use_data_parallel
,
drop_last
=
True
)
parallel_batch
=
[]
global_run_states
=
[]
for
run_step
,
batch
in
enumerate
(
self
.
reader
(),
start
=
1
):
period_run_states
=
[]
if
self
.
config
.
use_data_parallel
:
parallel_batch
+=
batch
if
len
(
parallel_batch
)
<
self
.
device_count
:
continue
else
:
batch
=
parallel_batch
parallel_batch
=
[]
step_run_state
=
RunState
(
len
(
self
.
fetch_list
))
for
run_step
,
batch
in
enumerate
(
data_reader
(),
start
=
1
):
step_run_state
.
run_step
=
1
step_run_state
=
RunState
(
len
(
self
.
fetch_list
))
num_batch_examples
=
len
(
batch
)
step_run_state
.
run_step
=
1
num_batch_examples
=
len
(
batch
)
if
self
.
return_numpy
:
fetch_result
=
self
.
exe
.
run
(
self
.
main_program_to_be_run
,
feed
=
data_feeder
.
feed
(
batch
),
fetch_list
=
self
.
fetch_list
)
else
:
fetch_result
=
self
.
exe
.
run
(
fetch_result
=
self
.
exe
.
run
(
self
.
main_program_to_be_run
,
self
.
main_program_to_be_run
,
feed
=
data_feeder
.
feed
(
batch
)
,
feed
=
batch
,
fetch_list
=
self
.
fetch_list
,
fetch_list
=
self
.
fetch_list
,
return_numpy
=
False
)
return_numpy
=
self
.
return_numpy
)
fetch_result
=
[
np
.
array
(
x
)
for
x
in
fetch_result
]
if
not
self
.
return_numpy
:
fetch_result
=
[
np
.
array
(
x
)
for
x
in
fetch_result
]
for
index
,
result
in
enumerate
(
fetch_result
):
step_run_state
.
run_results
[
index
]
=
result
for
index
,
result
in
enumerate
(
fetch_result
):
step_run_state
.
run_examples
+=
num_batch_examples
step_run_state
.
run_results
[
index
]
=
result
step_run_state
.
update
()
step_run_state
.
run_examples
+=
num_batch_examples
period_run_states
+=
[
step_run_state
]
step_run_state
.
update
()
self
.
env
.
current_step
+=
1
period_run_states
+=
[
step_run_state
]
if
self
.
is_train_phase
:
self
.
env
.
current_step
+=
1
if
self
.
current_step
%
self
.
config
.
log_interval
==
0
:
if
self
.
is_train_phase
:
self
.
_log_interval_event
(
period_run_states
)
if
self
.
current_step
%
self
.
config
.
log_interval
==
0
:
global_run_states
+=
period_run_states
self
.
_log_interval_event
(
period_run_states
)
period_run_states
=
[]
global_run_states
+=
period_run_states
period_run_states
=
[]
if
self
.
config
.
save_ckpt_interval
and
self
.
current_step
%
self
.
config
.
save_ckpt_interval
==
0
:
self
.
_save_ckpt_interval_event
()
if
self
.
config
.
save_ckpt_interval
and
self
.
current_step
%
self
.
config
.
save_ckpt_interval
==
0
:
self
.
_save_ckpt_interval_event
()
if
do_eval
and
self
.
current_step
%
self
.
config
.
eval_interval
==
0
:
self
.
_eval_interval_event
()
if
do_eval
and
self
.
current_step
%
self
.
config
.
eval_interval
==
0
:
self
.
_eval_interval_event
()
self
.
_run_step_event
(
step_run_state
)
self
.
_run_step_event
(
step_run_state
)
global_run_states
+=
period_run_states
return
global_run_states
global_run_states
+=
period_run_states
return
global_run_states
def
_run_with_py_reader
(
self
,
do_eval
=
False
):
flag
=
False
use_data_parallel_backup
=
self
.
config
.
use_data_parallel
while
True
:
global_run_states
=
[]
period_run_states
=
[]
self
.
py_reader
.
decorate_paddle_reader
(
self
.
reader
)
self
.
py_reader
.
start
()
try
:
while
True
:
num_batch_examples
=
self
.
config
.
batch_size
*
self
.
device_count
step_run_state
=
RunState
(
len
(
self
.
fetch_list
))
step_run_state
.
run_step
=
1
if
self
.
return_numpy
:
fetch_result
=
self
.
exe
.
run
(
self
.
main_program_to_be_run
,
fetch_list
=
self
.
fetch_list
)
else
:
fetch_result
=
self
.
exe
.
run
(
self
.
main_program_to_be_run
,
fetch_list
=
self
.
fetch_list
,
return_numpy
=
False
)
fetch_result
=
[
np
.
array
(
x
)
for
x
in
fetch_result
]
for
index
,
result
in
enumerate
(
fetch_result
):
step_run_state
.
run_results
[
index
]
=
result
step_run_state
.
run_examples
+=
num_batch_examples
step_run_state
.
update
()
period_run_states
+=
[
step_run_state
]
self
.
env
.
current_step
+=
1
if
self
.
is_train_phase
:
if
self
.
current_step
%
self
.
config
.
log_interval
==
0
:
self
.
_log_interval_event
(
period_run_states
)
global_run_states
+=
period_run_states
period_run_states
=
[]
if
self
.
config
.
save_ckpt_interval
and
self
.
current_step
%
self
.
config
.
save_ckpt_interval
==
0
:
self
.
_save_ckpt_interval_event
()
if
do_eval
and
self
.
current_step
%
self
.
config
.
eval_interval
==
0
:
self
.
_eval_interval_event
()
self
.
_run_step_event
(
step_run_state
)
except
fluid
.
core
.
EOFException
:
global_run_states
+=
period_run_states
self
.
py_reader
.
reset
()
'''
When opening use_data_parallel and use_pyreader, if the amount of data is too small,
the reader will have thrown EOF Exception when not fetching to the running result.
In this case, temporarily close the use_data_parallel to get the result.
'''
if
flag
:
self
.
config
.
_use_data_parallel
=
use_data_parallel_backup
elif
len
(
global_run_states
)
==
0
:
flag
=
True
self
.
config
.
_use_data_parallel
=
False
continue
break
return
global_run_states
def
__repr__
(
self
):
def
__repr__
(
self
):
return
"Task: %s with metrics_choices: %s, reader: %s, %s"
%
(
return
"Task: %s with metrics_choices: %s, reader: %s, %s"
%
(
...
...
paddlehub/finetune/task/reading_comprehension_task.py
浏览文件 @
2ef7c1e9
...
@@ -409,7 +409,8 @@ class ReadingComprehensionTask(BaseTask):
...
@@ -409,7 +409,8 @@ class ReadingComprehensionTask(BaseTask):
def
_build_net
(
self
):
def
_build_net
(
self
):
self
.
unique_ids
=
fluid
.
layers
.
data
(
self
.
unique_ids
=
fluid
.
layers
.
data
(
name
=
"unique_ids"
,
shape
=
[
-
1
,
1
],
lod_level
=
0
,
dtype
=
"int64"
)
name
=
"unique_ids"
,
shape
=
[
-
1
,
1
],
lod_level
=
0
,
dtype
=
"int64"
)
# to avoid memory optimization
_
=
fluid
.
layers
.
assign
(
self
.
unique_ids
)
logits
=
fluid
.
layers
.
fc
(
logits
=
fluid
.
layers
.
fc
(
input
=
self
.
feature
,
input
=
self
.
feature
,
size
=
2
,
size
=
2
,
...
...
paddlehub/finetune/task/sequence_task.py
浏览文件 @
2ef7c1e9
...
@@ -64,17 +64,17 @@ class SequenceLabelTask(BaseTask):
...
@@ -64,17 +64,17 @@ class SequenceLabelTask(BaseTask):
return
True
return
True
def
_build_net
(
self
):
def
_build_net
(
self
):
self
.
seq_len
=
fluid
.
layers
.
data
(
name
=
"seq_len"
,
shape
=
[
1
],
dtype
=
'int64'
,
lod_level
=
0
)
if
version_compare
(
paddle
.
__version__
,
"1.6"
):
if
version_compare
(
paddle
.
__version__
,
"1.6"
):
self
.
seq_len
=
fluid
.
layers
.
data
(
self
.
seq_len_used
=
fluid
.
layers
.
squeeze
(
self
.
seq_len
,
axes
=
[
1
])
name
=
"seq_len"
,
shape
=
[
-
1
],
dtype
=
'int64'
)
else
:
else
:
self
.
seq_len
=
fluid
.
layers
.
data
(
self
.
seq_len_used
=
self
.
seq_len
name
=
"seq_len"
,
shape
=
[
1
],
dtype
=
'int64'
)
seq_len
=
fluid
.
layers
.
assign
(
self
.
seq_len
)
if
self
.
add_crf
:
if
self
.
add_crf
:
unpad_feature
=
fluid
.
layers
.
sequence_unpad
(
unpad_feature
=
fluid
.
layers
.
sequence_unpad
(
self
.
feature
,
length
=
self
.
seq_len
)
self
.
feature
,
length
=
self
.
seq_len
_used
)
self
.
emission
=
fluid
.
layers
.
fc
(
self
.
emission
=
fluid
.
layers
.
fc
(
size
=
self
.
num_classes
,
size
=
self
.
num_classes
,
input
=
unpad_feature
,
input
=
unpad_feature
,
...
@@ -103,7 +103,6 @@ class SequenceLabelTask(BaseTask):
...
@@ -103,7 +103,6 @@ class SequenceLabelTask(BaseTask):
self
.
ret_infers
=
fluid
.
layers
.
reshape
(
self
.
ret_infers
=
fluid
.
layers
.
reshape
(
x
=
fluid
.
layers
.
argmax
(
self
.
logits
,
axis
=
2
),
shape
=
[
-
1
,
1
])
x
=
fluid
.
layers
.
argmax
(
self
.
logits
,
axis
=
2
),
shape
=
[
-
1
,
1
])
ret_infers
=
fluid
.
layers
.
assign
(
self
.
ret_infers
)
logits
=
self
.
logits
logits
=
self
.
logits
logits
=
fluid
.
layers
.
flatten
(
logits
,
axis
=
2
)
logits
=
fluid
.
layers
.
flatten
(
logits
,
axis
=
2
)
...
@@ -118,7 +117,8 @@ class SequenceLabelTask(BaseTask):
...
@@ -118,7 +117,8 @@ class SequenceLabelTask(BaseTask):
def
_add_loss
(
self
):
def
_add_loss
(
self
):
if
self
.
add_crf
:
if
self
.
add_crf
:
labels
=
fluid
.
layers
.
sequence_unpad
(
self
.
labels
[
0
],
self
.
seq_len
)
labels
=
fluid
.
layers
.
sequence_unpad
(
self
.
labels
[
0
],
self
.
seq_len_used
)
crf_cost
=
fluid
.
layers
.
linear_chain_crf
(
crf_cost
=
fluid
.
layers
.
linear_chain_crf
(
input
=
self
.
emission
,
input
=
self
.
emission
,
label
=
labels
,
label
=
labels
,
...
@@ -133,7 +133,8 @@ class SequenceLabelTask(BaseTask):
...
@@ -133,7 +133,8 @@ class SequenceLabelTask(BaseTask):
def
_add_metrics
(
self
):
def
_add_metrics
(
self
):
if
self
.
add_crf
:
if
self
.
add_crf
:
labels
=
fluid
.
layers
.
sequence_unpad
(
self
.
labels
[
0
],
self
.
seq_len
)
labels
=
fluid
.
layers
.
sequence_unpad
(
self
.
labels
[
0
],
self
.
seq_len_used
)
(
precision
,
recall
,
f1_score
,
num_infer_chunks
,
num_label_chunks
,
(
precision
,
recall
,
f1_score
,
num_infer_chunks
,
num_label_chunks
,
num_correct_chunks
)
=
fluid
.
layers
.
chunk_eval
(
num_correct_chunks
)
=
fluid
.
layers
.
chunk_eval
(
input
=
self
.
outputs
[
0
],
input
=
self
.
outputs
[
0
],
...
@@ -146,7 +147,7 @@ class SequenceLabelTask(BaseTask):
...
@@ -146,7 +147,7 @@ class SequenceLabelTask(BaseTask):
else
:
else
:
self
.
ret_labels
=
fluid
.
layers
.
reshape
(
self
.
ret_labels
=
fluid
.
layers
.
reshape
(
x
=
self
.
labels
[
0
],
shape
=
[
-
1
,
1
])
x
=
self
.
labels
[
0
],
shape
=
[
-
1
,
1
])
return
[
self
.
ret_labels
,
self
.
ret_infers
,
self
.
seq_len
]
return
[
self
.
ret_labels
,
self
.
ret_infers
,
self
.
seq_len
_used
]
def
_calculate_metrics
(
self
,
run_states
):
def
_calculate_metrics
(
self
,
run_states
):
total_infer
=
total_label
=
total_correct
=
loss_sum
=
0
total_infer
=
total_label
=
total_correct
=
loss_sum
=
0
...
@@ -214,7 +215,7 @@ class SequenceLabelTask(BaseTask):
...
@@ -214,7 +215,7 @@ class SequenceLabelTask(BaseTask):
if
self
.
is_train_phase
or
self
.
is_test_phase
:
if
self
.
is_train_phase
or
self
.
is_test_phase
:
return
[
metric
.
name
for
metric
in
self
.
metrics
]
+
[
self
.
loss
.
name
]
return
[
metric
.
name
for
metric
in
self
.
metrics
]
+
[
self
.
loss
.
name
]
elif
self
.
is_predict_phase
:
elif
self
.
is_predict_phase
:
return
[
self
.
ret_infers
.
name
]
+
[
self
.
seq_len
.
name
]
return
[
self
.
ret_infers
.
name
]
+
[
self
.
seq_len
_used
.
name
]
return
[
output
.
name
for
output
in
self
.
outputs
]
return
[
output
.
name
for
output
in
self
.
outputs
]
def
_postprocessing
(
self
,
run_states
):
def
_postprocessing
(
self
,
run_states
):
...
...
paddlehub/reader/cv_reader.py
浏览文件 @
2ef7c1e9
#coding:utf-8
#
coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License"
# Licensed under the Apache License, Version 2.0 (the "License"
...
@@ -77,7 +77,8 @@ class ImageClassificationReader(BaseReader):
...
@@ -77,7 +77,8 @@ class ImageClassificationReader(BaseReader):
batch_size
=
1
,
batch_size
=
1
,
phase
=
"train"
,
phase
=
"train"
,
shuffle
=
False
,
shuffle
=
False
,
data
=
None
):
data
=
None
,
return_list
=
True
):
if
phase
!=
'predict'
and
not
self
.
dataset
:
if
phase
!=
'predict'
and
not
self
.
dataset
:
raise
ValueError
(
"The dataset is none and it's not allowed!"
)
raise
ValueError
(
"The dataset is none and it's not allowed!"
)
if
phase
==
"train"
:
if
phase
==
"train"
:
...
@@ -135,14 +136,48 @@ class ImageClassificationReader(BaseReader):
...
@@ -135,14 +136,48 @@ class ImageClassificationReader(BaseReader):
def
_data_reader
():
def
_data_reader
():
if
shuffle
:
if
shuffle
:
np
.
random
.
shuffle
(
data
)
np
.
random
.
shuffle
(
data
)
images
=
[]
labels
=
[]
if
phase
==
"predict"
:
if
phase
==
"predict"
:
for
image_path
in
data
:
for
image_path
in
data
:
image
=
preprocess
(
image_path
)
image
=
preprocess
(
image_path
)
yield
(
image
,
)
images
.
append
(
image
.
astype
(
'float32'
))
if
len
(
images
)
==
batch_size
:
# predictor must receive numpy array not list
images
=
np
.
array
([
images
]).
astype
(
'float32'
)
if
return_list
:
# for DataFeeder
yield
[
images
]
else
:
# for DataLoader
yield
images
images
=
[]
if
images
:
images
=
np
.
array
([
images
]).
astype
(
'float32'
)
if
return_list
:
yield
[
images
]
else
:
yield
images
images
=
[]
else
:
else
:
for
image_path
,
label
in
data
:
for
image_path
,
label
in
data
:
image
=
preprocess
(
image_path
)
image
=
preprocess
(
image_path
)
yield
(
image
,
label
)
images
.
append
(
image
.
astype
(
'float32'
))
labels
.
append
([
int
(
label
)])
return
paddle
.
batch
(
_data_reader
,
batch_size
=
batch_size
)
if
len
(
images
)
==
batch_size
:
if
return_list
:
yield
[[
images
,
labels
]]
else
:
yield
[
images
,
labels
]
images
=
[]
labels
=
[]
if
images
:
if
return_list
:
yield
[[
images
,
labels
]]
else
:
yield
[
images
,
labels
]
images
=
[]
labels
=
[]
return
_data_reader
paddlehub/reader/nlp_reader.py
浏览文件 @
2ef7c1e9
...
@@ -22,7 +22,7 @@ import numpy as np
...
@@ -22,7 +22,7 @@ import numpy as np
import
six
import
six
from
collections
import
namedtuple
from
collections
import
namedtuple
import
paddle
import
paddle
.fluid
as
fluid
from
paddlehub.reader
import
tokenization
from
paddlehub.reader
import
tokenization
from
paddlehub.common.logger
import
logger
from
paddlehub.common.logger
import
logger
...
@@ -203,7 +203,8 @@ class BaseNLPReader(BaseReader):
...
@@ -203,7 +203,8 @@ class BaseNLPReader(BaseReader):
batch_size
=
1
,
batch_size
=
1
,
phase
=
'train'
,
phase
=
'train'
,
shuffle
=
True
,
shuffle
=
True
,
data
=
None
):
data
=
None
,
return_list
=
True
):
if
phase
!=
'predict'
and
not
self
.
dataset
:
if
phase
!=
'predict'
and
not
self
.
dataset
:
raise
ValueError
(
"The dataset is None ! It isn't allowed."
)
raise
ValueError
(
"The dataset is None ! It isn't allowed."
)
if
phase
==
'train'
:
if
phase
==
'train'
:
...
@@ -255,7 +256,12 @@ class BaseNLPReader(BaseReader):
...
@@ -255,7 +256,12 @@ class BaseNLPReader(BaseReader):
for
batch_data
in
self
.
_prepare_batch_data
(
for
batch_data
in
self
.
_prepare_batch_data
(
examples
,
batch_size
,
phase
=
phase
):
examples
,
batch_size
,
phase
=
phase
):
yield
[
batch_data
]
if
return_list
:
# for DataFeeder
yield
[
batch_data
]
else
:
# for DataLoader
yield
batch_data
return
wrapper
return
wrapper
...
@@ -666,7 +672,8 @@ class RegressionReader(BaseNLPReader):
...
@@ -666,7 +672,8 @@ class RegressionReader(BaseNLPReader):
batch_size
=
1
,
batch_size
=
1
,
phase
=
'train'
,
phase
=
'train'
,
shuffle
=
True
,
shuffle
=
True
,
data
=
None
):
data
=
None
,
return_list
=
True
):
if
phase
!=
'predict'
and
not
self
.
dataset
:
if
phase
!=
'predict'
and
not
self
.
dataset
:
raise
ValueError
(
"The dataset is none and it's not allowed."
)
raise
ValueError
(
"The dataset is none and it's not allowed."
)
if
phase
==
'train'
:
if
phase
==
'train'
:
...
@@ -715,7 +722,12 @@ class RegressionReader(BaseNLPReader):
...
@@ -715,7 +722,12 @@ class RegressionReader(BaseNLPReader):
for
batch_data
in
self
.
_prepare_batch_data
(
for
batch_data
in
self
.
_prepare_batch_data
(
examples
,
batch_size
,
phase
=
phase
):
examples
,
batch_size
,
phase
=
phase
):
yield
[
batch_data
]
if
return_list
:
# for DataFeeder
yield
[
batch_data
]
else
:
# for DataLoader
yield
batch_data
return
wrapper
return
wrapper
...
@@ -884,7 +896,8 @@ class ReadingComprehensionReader(BaseNLPReader):
...
@@ -884,7 +896,8 @@ class ReadingComprehensionReader(BaseNLPReader):
batch_size
=
1
,
batch_size
=
1
,
phase
=
'train'
,
phase
=
'train'
,
shuffle
=
False
,
shuffle
=
False
,
data
=
None
):
data
=
None
,
return_list
=
True
):
# we need all_examples and all_features in write_prediction in reading_comprehension_task
# we need all_examples and all_features in write_prediction in reading_comprehension_task
# we can also use all_examples and all_features to avoid duplicate long-time preprocessing
# we can also use all_examples and all_features to avoid duplicate long-time preprocessing
examples
=
None
examples
=
None
...
@@ -926,7 +939,12 @@ class ReadingComprehensionReader(BaseNLPReader):
...
@@ -926,7 +939,12 @@ class ReadingComprehensionReader(BaseNLPReader):
for
batch_data
in
self
.
_prepare_batch_data
(
for
batch_data
in
self
.
_prepare_batch_data
(
features
,
batch_size
,
phase
=
phase
):
features
,
batch_size
,
phase
=
phase
):
yield
[
batch_data
]
if
return_list
:
# for DataFeeder
yield
[
batch_data
]
else
:
# for DataLoader
yield
batch_data
return
wrapper
return
wrapper
...
@@ -1147,12 +1165,20 @@ class LACClassifyReader(BaseReader):
...
@@ -1147,12 +1165,20 @@ class LACClassifyReader(BaseReader):
self
.
feed_key
=
list
(
self
.
feed_key
=
list
(
self
.
lac
.
processor
.
data_format
(
self
.
lac
.
processor
.
data_format
(
sign_name
=
"lexical_analysis"
).
keys
())[
0
]
sign_name
=
"lexical_analysis"
).
keys
())[
0
]
self
.
has_processed
=
{
"train"
:
False
,
"dev"
:
False
,
"val"
:
False
,
"test"
:
False
,
"predict"
:
False
}
def
data_generator
(
self
,
def
data_generator
(
self
,
batch_size
=
1
,
batch_size
=
1
,
phase
=
"train"
,
phase
=
"train"
,
shuffle
=
False
,
shuffle
=
False
,
data
=
None
):
data
=
None
,
return_list
=
True
):
if
phase
!=
"predict"
and
not
self
.
dataset
:
if
phase
!=
"predict"
and
not
self
.
dataset
:
raise
ValueError
(
"The dataset is None and it isn't allowed."
)
raise
ValueError
(
"The dataset is None and it isn't allowed."
)
if
phase
==
"train"
:
if
phase
==
"train"
:
...
@@ -1180,32 +1206,96 @@ class LACClassifyReader(BaseReader):
...
@@ -1180,32 +1206,96 @@ class LACClassifyReader(BaseReader):
self
.
vocab
[
word
]
for
word
in
processed
[
0
][
'word'
]
self
.
vocab
[
word
]
for
word
in
processed
[
0
][
'word'
]
if
word
in
self
.
vocab
if
word
in
self
.
vocab
]
]
if
len
(
processed
)
==
0
:
if
len
(
processed
)
==
0
:
if
six
.
PY2
:
if
six
.
PY2
:
text
=
text
.
encode
(
sys_stdout_encoding
())
text
=
text
.
encode
(
sys_stdout_encoding
())
logger
.
warning
(
logger
.
warning
(
"The words in text %s can't be found in the vocabulary."
%
"The words in text %s can't be found in the vocabulary."
%
(
text
))
(
text
))
return
processed
return
processed
if
not
self
.
has_processed
[
phase
]:
logger
.
info
(
"processing %s data now... this may take a few minutes"
%
phase
)
for
i
in
range
(
len
(
data
)):
if
phase
==
"predict"
:
data
[
i
]
=
preprocess
(
data
[
i
])
else
:
data
[
i
].
text_a
=
preprocess
(
data
[
i
].
text_a
)
if
self
.
label_map
:
if
data
[
i
].
label
not
in
self
.
label_map
:
raise
KeyError
(
"example.label = {%s} not in label"
%
data
[
i
].
label
)
label_id
=
self
.
label_map
[
data
[
i
].
label
]
else
:
label_id
=
data
[
i
].
label
data
[
i
].
label
=
label_id
self
.
has_processed
[
phase
]
=
True
def
_data_reader
():
def
_data_reader
():
if
shuffle
:
if
shuffle
:
np
.
random
.
shuffle
(
data
)
np
.
random
.
shuffle
(
data
)
texts
=
[]
labels
=
[]
if
phase
==
"predict"
:
if
phase
==
"predict"
:
for
text
in
data
:
for
text
in
data
:
text
=
preprocess
(
text
)
if
not
text
:
if
not
text
:
continue
continue
yield
(
text
,
)
texts
.
append
(
text
)
if
len
(
texts
)
==
batch_size
:
if
return_list
:
# for DataFeeder
# if you want to use high-performance predictor, yield [[[t] for t in texts]]
yield
[[
t
]
for
t
in
texts
]
else
:
# for DataLoader
# cannot use in high-performance predictor, as PaddleTensor rejects lod_tensor
texts
=
fluid
.
create_lod_tensor
(
texts
,
[[
len
(
seq
)
for
seq
in
texts
]],
fluid
.
CPUPlace
())
yield
[
texts
]
texts
=
[]
if
texts
:
if
return_list
:
yield
[[
t
]
for
t
in
texts
]
else
:
texts
=
fluid
.
create_lod_tensor
(
texts
,
[[
len
(
seq
)
for
seq
in
texts
]],
fluid
.
CPUPlace
())
yield
[
texts
]
texts
=
[]
else
:
else
:
for
item
in
data
:
for
item
in
data
:
text
=
preprocess
(
item
.
text_a
)
text
=
item
.
text_a
if
not
text
:
if
not
text
:
continue
continue
yield
(
text
,
item
.
label
)
texts
.
append
(
text
)
labels
.
append
([
item
.
label
])
return
paddle
.
batch
(
_data_reader
,
batch_size
=
batch_size
)
if
len
(
texts
)
==
batch_size
:
if
return_list
:
yield
list
(
zip
(
texts
,
labels
))
else
:
texts
=
fluid
.
create_lod_tensor
(
texts
,
[[
len
(
seq
)
for
seq
in
texts
]],
fluid
.
CPUPlace
())
yield
[
texts
,
labels
]
texts
=
[]
labels
=
[]
if
texts
:
if
return_list
:
yield
list
(
zip
(
texts
,
labels
))
else
:
texts
=
fluid
.
create_lod_tensor
(
texts
,
[[
len
(
seq
)
for
seq
in
texts
]],
fluid
.
CPUPlace
())
yield
[
texts
,
labels
]
texts
=
[]
labels
=
[]
return
_data_reader
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录