Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
2f44585e
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
大约 2 年 前同步成功
通知
708
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
2f44585e
编写于
6月 06, 2018
作者:
T
tangwei12
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
code optimized
上级
53409a29
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
52 addition
and
53 deletion
+52
-53
python/paddle/fluid/io.py
python/paddle/fluid/io.py
+20
-22
python/paddle/fluid/tests/unittests/test_checkpoint.py
python/paddle/fluid/tests/unittests/test_checkpoint.py
+2
-1
python/paddle/fluid/trainer.py
python/paddle/fluid/trainer.py
+30
-30
未找到文件。
python/paddle/fluid/io.py
浏览文件 @
2f44585e
...
@@ -476,14 +476,14 @@ def save_checkpoint(executor,
...
@@ -476,14 +476,14 @@ def save_checkpoint(executor,
to keep numbers of checkpoint directory, the numbers of checkpoint directory are max_num_checkpoints at most,
to keep numbers of checkpoint directory, the numbers of checkpoint directory are max_num_checkpoints at most,
The interval between two saved checkpoints must greater than save_interval_secs.
The interval between two saved checkpoints must greater than save_interval_secs.
:param executor
:param executor
executor for save the value
:param checkpoint_dir
:param checkpoint_dir
the checkpoint directory
:param trainer_id
:param trainer_id
currect trainer id
:param is_chief
:param is_chief
if the trainer id equals 0, the is_chief will be true
:param main_program
:param main_program
will save all variables in program
:param max_num_checkpoints
:param max_num_checkpoints
will keep numbers of checkpoint serials not bigger than max_num_checkpoints
"""
"""
if
checkpoint_dir
.
strip
()
is
None
:
if
checkpoint_dir
is
None
:
raise
ValueError
(
"'checkpoint_dir' should not be None"
)
raise
ValueError
(
"'checkpoint_dir' should not be None"
)
if
trainer_args
:
if
trainer_args
:
...
@@ -500,7 +500,7 @@ def save_checkpoint(executor,
...
@@ -500,7 +500,7 @@ def save_checkpoint(executor,
if
is_chief
:
if
is_chief
:
save_persist_vars_without_grad
(
executor
,
cur_dir
,
main_program
)
save_persist_vars_without_grad
(
executor
,
cur_dir
,
main_program
)
_
lru
_delete
(
checkpoint_dir
,
max_num_checkpoints
)
_
scroll
_delete
(
checkpoint_dir
,
max_num_checkpoints
)
def
load_checkpoint
(
executor
,
checkpoint_dir
,
serial
,
main_program
):
def
load_checkpoint
(
executor
,
checkpoint_dir
,
serial
,
main_program
):
...
@@ -508,13 +508,13 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program):
...
@@ -508,13 +508,13 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program):
Load checkpoint from a directory by executor,
Load checkpoint from a directory by executor,
it will find the most recent saved checkpoint file and load it auto.
it will find the most recent saved checkpoint file and load it auto.
:param executor
:param executor
executor for load the value
:param checkpoint_dir
:param checkpoint_dir
the checkpoint directory
:param serial
:param serial
the serial folder in checkpoint directory will be load
:param main_program
:param main_program
will load all variables in program
"""
"""
if
checkpoint_dir
.
strip
()
is
None
:
if
checkpoint_dir
is
None
:
raise
ValueError
(
"'checkpoint_dir' should not be None"
)
raise
ValueError
(
"'checkpoint_dir' should not be None"
)
if
serial
is
None
or
serial
<
0
:
if
serial
is
None
or
serial
<
0
:
...
@@ -536,9 +536,9 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False):
...
@@ -536,9 +536,9 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False):
:param delete_dir
:param delete_dir
"""
"""
if
checkpoint_dir
.
strip
()
is
None
:
if
checkpoint_dir
is
None
:
raise
ValueError
(
"'checkpoint_dir' should not be None"
)
raise
ValueError
(
"'checkpoint_dir' should not be None"
)
_
lru
_delete
(
checkpoint_dir
,
max_num_checkpoints
=
0
)
_
scroll
_delete
(
checkpoint_dir
,
max_num_checkpoints
=
0
)
if
delete_dir
and
not
os
.
listdir
(
checkpoint_dir
):
if
delete_dir
and
not
os
.
listdir
(
checkpoint_dir
):
os
.
rmdir
(
checkpoint_dir
)
os
.
rmdir
(
checkpoint_dir
)
...
@@ -681,7 +681,7 @@ def _get_trainer_dir(dirname, trainer_id):
...
@@ -681,7 +681,7 @@ def _get_trainer_dir(dirname, trainer_id):
return
trainer_dir
return
trainer_dir
def
_
lru
_delete
(
dirname
,
max_num_checkpoints
=
3
):
def
_
scroll
_delete
(
dirname
,
max_num_checkpoints
=
3
):
dirs
=
os
.
listdir
(
dirname
)
dirs
=
os
.
listdir
(
dirname
)
serial_map
=
{}
serial_map
=
{}
for
serial
in
dirs
:
for
serial
in
dirs
:
...
@@ -717,7 +717,7 @@ def get_latest_checkpoint_serial(checkpoint_dir):
...
@@ -717,7 +717,7 @@ def get_latest_checkpoint_serial(checkpoint_dir):
:param checkpoint_dir
:param checkpoint_dir
"""
"""
if
not
checkpoint_dir
.
strip
()
:
if
not
checkpoint_dir
:
return
-
1
return
-
1
def
has_success
(
checkpoint_dir
,
cur_dir
):
def
has_success
(
checkpoint_dir
,
cur_dir
):
...
@@ -726,10 +726,8 @@ def get_latest_checkpoint_serial(checkpoint_dir):
...
@@ -726,10 +726,8 @@ def get_latest_checkpoint_serial(checkpoint_dir):
"""
"""
serial
=
_get_dir_serial
(
cur_dir
)
serial
=
_get_dir_serial
(
cur_dir
)
if
serial
==
-
1
:
if
serial
==
-
1
or
not
os
.
path
.
isdir
(
return
-
1
os
.
path
.
join
(
checkpoint_dir
,
cur_dir
)):
if
not
os
.
path
.
isdir
(
os
.
path
.
join
(
checkpoint_dir
,
cur_dir
)):
return
-
1
return
-
1
success_path
=
os
.
path
.
join
(
success_path
=
os
.
path
.
join
(
...
...
python/paddle/fluid/tests/unittests/test_checkpoint.py
浏览文件 @
2f44585e
...
@@ -15,11 +15,12 @@
...
@@ -15,11 +15,12 @@
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
import
unittest
import
unittest
import
os
import
os
import
tempfile
class
TestCheckpoint
(
unittest
.
TestCase
):
class
TestCheckpoint
(
unittest
.
TestCase
):
def
setUp
(
self
):
def
setUp
(
self
):
self
.
dirname
=
"/tmp/ckpt"
self
.
dirname
=
tempfile
.
mktemp
()
self
.
max_num_checkpoints
=
3
self
.
max_num_checkpoints
=
3
self
.
epoch_interval
=
1
self
.
epoch_interval
=
1
self
.
step_interval
=
1
self
.
step_interval
=
1
...
...
python/paddle/fluid/trainer.py
浏览文件 @
2f44585e
...
@@ -132,19 +132,18 @@ class Trainer(object):
...
@@ -132,19 +132,18 @@ class Trainer(object):
# 1. we need to generate a framework.Program by calling
# 1. we need to generate a framework.Program by calling
# program_func. Reference: fluid.program_guard in
# program_func. Reference: fluid.program_guard in
# test_word2vec.py
# test_word2vec.py
if
not
isinstance
(
optimizer
,
opt_module
.
Optimizer
):
assert
isinstance
(
optimizer
,
opt_module
.
Optimizer
)
raise
TypeError
(
"The optimizer should be an instance of Optimizer"
)
# config for checkpoint
# config for checkpoint
# only chief worker will save variables
# only chief worker will save variables
self
.
trainer_id
=
0
self
.
trainer_id
=
0
self
.
chief
=
True
self
.
chief
=
True
self
.
checkpoint
=
checkpoint_config
self
.
checkpoint
_cfg
=
checkpoint_config
if
self
.
checkpoint
:
if
self
.
checkpoint
_cfg
:
assert
isinstance
(
self
.
checkpoint
,
CheckpointConfig
)
assert
isinstance
(
self
.
checkpoint
_cfg
,
CheckpointConfig
)
serial
=
io
.
get_latest_checkpoint_serial
(
serial
=
io
.
get_latest_checkpoint_serial
(
self
.
checkpoint
.
checkpoint_dir
)
self
.
checkpoint
_cfg
.
checkpoint_dir
)
self
.
checkpoint
.
load_serial
=
serial
if
serial
>=
0
else
None
self
.
checkpoint
_cfg
.
load_serial
=
serial
if
serial
>=
0
else
None
self
.
scope
=
core
.
Scope
()
self
.
scope
=
core
.
Scope
()
...
@@ -174,19 +173,20 @@ class Trainer(object):
...
@@ -174,19 +173,20 @@ class Trainer(object):
exe
=
executor
.
Executor
(
place
)
exe
=
executor
.
Executor
(
place
)
exe
.
run
(
self
.
startup_program
)
exe
.
run
(
self
.
startup_program
)
if
self
.
checkpoint
and
self
.
checkpoint
.
load_serial
:
if
self
.
checkpoint
_cfg
and
self
.
checkpoint_cfg
.
load_serial
:
with
self
.
_prog_and_scope_guard
():
with
self
.
_prog_and_scope_guard
():
exe
=
executor
.
Executor
(
place
)
exe
=
executor
.
Executor
(
place
)
io
.
load_checkpoint
(
exe
,
self
.
checkpoint
.
checkpoint_dir
,
io
.
load_checkpoint
(
exe
,
self
.
checkpoint
_cfg
.
checkpoint_dir
,
self
.
checkpoint
.
load_serial
,
self
.
checkpoint
_cfg
.
load_serial
,
self
.
startup_program
)
self
.
startup_program
)
if
not
self
.
checkpoint
.
is_pserver
:
if
not
self
.
checkpoint
_cfg
.
is_pserver
:
epoch_id
,
step_id
=
io
.
load_trainer_args
(
epoch_id
,
step_id
=
io
.
load_trainer_args
(
self
.
checkpoint
.
checkpoint_dir
,
self
.
checkpoint
.
load_serial
,
self
.
checkpoint_cfg
.
checkpoint_dir
,
self
.
trainer_id
,
self
.
_get_checkpoint_load_args
())
self
.
checkpoint_cfg
.
load_serial
,
self
.
trainer_id
,
self
.
checkpoint
.
epoch_id
=
int
(
epoch_id
)
self
.
_get_checkpoint_load_args
())
self
.
checkpoint
.
step_id
=
int
(
step_id
)
self
.
checkpoint_cfg
.
epoch_id
=
int
(
epoch_id
)
self
.
checkpoint_cfg
.
step_id
=
int
(
step_id
)
if
param_path
and
os
.
path
.
isdir
(
param_path
):
if
param_path
and
os
.
path
.
isdir
(
param_path
):
# load params from param_path into scope
# load params from param_path into scope
...
@@ -256,7 +256,7 @@ class Trainer(object):
...
@@ -256,7 +256,7 @@ class Trainer(object):
t
.
transpile
(
t
.
transpile
(
self
.
trainer_id
,
pservers
=
pserver_endpoints
,
trainers
=
trainers
)
self
.
trainer_id
,
pservers
=
pserver_endpoints
,
trainers
=
trainers
)
if
training_role
==
"PSERVER"
:
if
training_role
==
"PSERVER"
:
if
self
.
checkpoint
:
if
self
.
checkpoint
_cfg
:
self
.
is_pserver
=
True
self
.
is_pserver
=
True
self
.
train_program
=
t
.
get_pserver_program
(
current_endpoint
)
self
.
train_program
=
t
.
get_pserver_program
(
current_endpoint
)
...
@@ -351,10 +351,10 @@ class Trainer(object):
...
@@ -351,10 +351,10 @@ class Trainer(object):
self
.
_train_by_any_executor
(
event_handler
,
exe
,
num_epochs
,
reader
)
self
.
_train_by_any_executor
(
event_handler
,
exe
,
num_epochs
,
reader
)
def
_train_by_any_executor
(
self
,
event_handler
,
exe
,
num_epochs
,
reader
):
def
_train_by_any_executor
(
self
,
event_handler
,
exe
,
num_epochs
,
reader
):
if
self
.
checkpoint
:
if
self
.
checkpoint
_cfg
:
epochs
=
[
epochs
=
[
epoch_id
for
epoch_id
in
range
(
num_epochs
)
epoch_id
for
epoch_id
in
range
(
num_epochs
)
if
epoch_id
>=
self
.
checkpoint
.
epoch_id
if
epoch_id
>=
self
.
checkpoint
_cfg
.
epoch_id
]
]
else
:
else
:
epochs
=
[
epoch_id
for
epoch_id
in
range
(
num_epochs
)]
epochs
=
[
epoch_id
for
epoch_id
in
range
(
num_epochs
)]
...
@@ -366,8 +366,8 @@ class Trainer(object):
...
@@ -366,8 +366,8 @@ class Trainer(object):
self
.
_clean_checkpoint
()
self
.
_clean_checkpoint
()
return
return
if
self
.
checkpoint
and
self
.
checkpoint
.
load_serial
\
if
self
.
checkpoint
_cfg
and
self
.
checkpoint_cfg
.
load_serial
\
and
self
.
checkpoint
.
step_id
>=
step_id
and
self
.
checkpoint
.
epoch_id
==
epoch_id
:
and
self
.
checkpoint
_cfg
.
step_id
>=
step_id
and
self
.
checkpoint_cfg
.
epoch_id
==
epoch_id
:
continue
continue
begin_event
=
BeginStepEvent
(
epoch_id
,
step_id
)
begin_event
=
BeginStepEvent
(
epoch_id
,
step_id
)
...
@@ -381,10 +381,12 @@ class Trainer(object):
...
@@ -381,10 +381,12 @@ class Trainer(object):
else
:
else
:
metrics
=
exe
.
run
(
feed
=
data
,
fetch_list
=
[])
metrics
=
exe
.
run
(
feed
=
data
,
fetch_list
=
[])
self
.
_save_checkpoint
(
epoch_id
,
step_id
)
if
self
.
checkpoint_cfg
:
self
.
_save_checkpoint
(
epoch_id
,
step_id
)
event_handler
(
EndStepEvent
(
epoch_id
,
step_id
,
metrics
))
event_handler
(
EndStepEvent
(
epoch_id
,
step_id
,
metrics
))
event_handler
(
EndEpochEvent
(
epoch_id
))
event_handler
(
EndEpochEvent
(
epoch_id
))
self
.
_clean_checkpoint
()
if
self
.
checkpoint_cfg
:
self
.
_clean_checkpoint
()
def
_test_by_executor
(
self
,
reader
,
feed_order
,
fetch_list
):
def
_test_by_executor
(
self
,
reader
,
feed_order
,
fetch_list
):
with
executor
.
scope_guard
(
self
.
scope
):
with
executor
.
scope_guard
(
self
.
scope
):
...
@@ -424,9 +426,8 @@ class Trainer(object):
...
@@ -424,9 +426,8 @@ class Trainer(object):
return
self
.
_get_parallel_executor
()
return
self
.
_get_parallel_executor
()
def
_clean_checkpoint
(
self
):
def
_clean_checkpoint
(
self
):
if
not
self
.
checkpoint
:
assert
self
.
checkpoint_cfg
return
io
.
clean_checkpoint
(
checkpoint_dir
=
self
.
checkpoint_cfg
.
checkpoint_dir
)
io
.
clean_checkpoint
(
checkpoint_dir
=
self
.
checkpoint
.
checkpoint_dir
)
def
_get_checkpoint_load_args
(
self
):
def
_get_checkpoint_load_args
(
self
):
"""
"""
...
@@ -444,19 +445,18 @@ class Trainer(object):
...
@@ -444,19 +445,18 @@ class Trainer(object):
return
trainer_args
return
trainer_args
def
_save_checkpoint
(
self
,
epoch_id
,
step_id
):
def
_save_checkpoint
(
self
,
epoch_id
,
step_id
):
if
not
self
.
checkpoint
:
assert
self
.
checkpoint_cfg
return
if
epoch_id
%
self
.
checkpoint
.
epoch_interval
==
0
and
step_id
%
self
.
checkpoint
.
step_interval
==
0
:
if
epoch_id
%
self
.
checkpoint
_cfg
.
epoch_interval
==
0
and
step_id
%
self
.
checkpoint_cfg
.
step_interval
==
0
:
exe
=
executor
.
Executor
(
self
.
place
)
exe
=
executor
.
Executor
(
self
.
place
)
io
.
save_checkpoint
(
io
.
save_checkpoint
(
executor
=
exe
,
executor
=
exe
,
checkpoint_dir
=
self
.
checkpoint
.
checkpoint_dir
,
checkpoint_dir
=
self
.
checkpoint
_cfg
.
checkpoint_dir
,
trainer_id
=
self
.
trainer_id
,
trainer_id
=
self
.
trainer_id
,
is_chief
=
self
.
chief
,
is_chief
=
self
.
chief
,
trainer_args
=
self
.
_get_checkpoint_save_args
(
epoch_id
,
step_id
),
trainer_args
=
self
.
_get_checkpoint_save_args
(
epoch_id
,
step_id
),
main_program
=
self
.
train_program
,
main_program
=
self
.
train_program
,
max_num_checkpoints
=
self
.
checkpoint
.
max_num_checkpoints
)
max_num_checkpoints
=
self
.
checkpoint
_cfg
.
max_num_checkpoints
)
def
build_feed_var_list
(
program
,
feed_order
):
def
build_feed_var_list
(
program
,
feed_order
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录