Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
2f44585e
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
2f44585e
编写于
6月 06, 2018
作者:
T
tangwei12
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
code optimized
上级
53409a29
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
52 addition
and
53 deletion
+52
-53
python/paddle/fluid/io.py
python/paddle/fluid/io.py
+20
-22
python/paddle/fluid/tests/unittests/test_checkpoint.py
python/paddle/fluid/tests/unittests/test_checkpoint.py
+2
-1
python/paddle/fluid/trainer.py
python/paddle/fluid/trainer.py
+30
-30
未找到文件。
python/paddle/fluid/io.py
浏览文件 @
2f44585e
...
...
@@ -476,14 +476,14 @@ def save_checkpoint(executor,
to keep numbers of checkpoint directory, the numbers of checkpoint directory are max_num_checkpoints at most,
The interval between two saved checkpoints must greater than save_interval_secs.
:param executor
:param checkpoint_dir
:param trainer_id
:param is_chief
:param main_program
:param max_num_checkpoints
"""
if
checkpoint_dir
.
strip
()
is
None
:
:param executor
executor for save the value
:param checkpoint_dir
the checkpoint directory
:param trainer_id
currect trainer id
:param is_chief
if the trainer id equals 0, the is_chief will be true
:param main_program
will save all variables in program
:param max_num_checkpoints
will keep numbers of checkpoint serials not bigger than max_num_checkpoints
"""
if
checkpoint_dir
is
None
:
raise
ValueError
(
"'checkpoint_dir' should not be None"
)
if
trainer_args
:
...
...
@@ -500,7 +500,7 @@ def save_checkpoint(executor,
if
is_chief
:
save_persist_vars_without_grad
(
executor
,
cur_dir
,
main_program
)
_
lru
_delete
(
checkpoint_dir
,
max_num_checkpoints
)
_
scroll
_delete
(
checkpoint_dir
,
max_num_checkpoints
)
def
load_checkpoint
(
executor
,
checkpoint_dir
,
serial
,
main_program
):
...
...
@@ -508,13 +508,13 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program):
Load checkpoint from a directory by executor,
it will find the most recent saved checkpoint file and load it auto.
:param executor
:param checkpoint_dir
:param serial
:param main_program
:param executor
executor for load the value
:param checkpoint_dir
the checkpoint directory
:param serial
the serial folder in checkpoint directory will be load
:param main_program
will load all variables in program
"""
if
checkpoint_dir
.
strip
()
is
None
:
if
checkpoint_dir
is
None
:
raise
ValueError
(
"'checkpoint_dir' should not be None"
)
if
serial
is
None
or
serial
<
0
:
...
...
@@ -536,9 +536,9 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False):
:param delete_dir
"""
if
checkpoint_dir
.
strip
()
is
None
:
if
checkpoint_dir
is
None
:
raise
ValueError
(
"'checkpoint_dir' should not be None"
)
_
lru
_delete
(
checkpoint_dir
,
max_num_checkpoints
=
0
)
_
scroll
_delete
(
checkpoint_dir
,
max_num_checkpoints
=
0
)
if
delete_dir
and
not
os
.
listdir
(
checkpoint_dir
):
os
.
rmdir
(
checkpoint_dir
)
...
...
@@ -681,7 +681,7 @@ def _get_trainer_dir(dirname, trainer_id):
return
trainer_dir
def
_
lru
_delete
(
dirname
,
max_num_checkpoints
=
3
):
def
_
scroll
_delete
(
dirname
,
max_num_checkpoints
=
3
):
dirs
=
os
.
listdir
(
dirname
)
serial_map
=
{}
for
serial
in
dirs
:
...
...
@@ -717,7 +717,7 @@ def get_latest_checkpoint_serial(checkpoint_dir):
:param checkpoint_dir
"""
if
not
checkpoint_dir
.
strip
()
:
if
not
checkpoint_dir
:
return
-
1
def
has_success
(
checkpoint_dir
,
cur_dir
):
...
...
@@ -726,10 +726,8 @@ def get_latest_checkpoint_serial(checkpoint_dir):
"""
serial
=
_get_dir_serial
(
cur_dir
)
if
serial
==
-
1
:
return
-
1
if
not
os
.
path
.
isdir
(
os
.
path
.
join
(
checkpoint_dir
,
cur_dir
)):
if
serial
==
-
1
or
not
os
.
path
.
isdir
(
os
.
path
.
join
(
checkpoint_dir
,
cur_dir
)):
return
-
1
success_path
=
os
.
path
.
join
(
...
...
python/paddle/fluid/tests/unittests/test_checkpoint.py
浏览文件 @
2f44585e
...
...
@@ -15,11 +15,12 @@
import
paddle.fluid
as
fluid
import
unittest
import
os
import
tempfile
class
TestCheckpoint
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
dirname
=
"/tmp/ckpt"
self
.
dirname
=
tempfile
.
mktemp
()
self
.
max_num_checkpoints
=
3
self
.
epoch_interval
=
1
self
.
step_interval
=
1
...
...
python/paddle/fluid/trainer.py
浏览文件 @
2f44585e
...
...
@@ -132,19 +132,18 @@ class Trainer(object):
# 1. we need to generate a framework.Program by calling
# program_func. Reference: fluid.program_guard in
# test_word2vec.py
if
not
isinstance
(
optimizer
,
opt_module
.
Optimizer
):
raise
TypeError
(
"The optimizer should be an instance of Optimizer"
)
assert
isinstance
(
optimizer
,
opt_module
.
Optimizer
)
# config for checkpoint
# only chief worker will save variables
self
.
trainer_id
=
0
self
.
chief
=
True
self
.
checkpoint
=
checkpoint_config
if
self
.
checkpoint
:
assert
isinstance
(
self
.
checkpoint
,
CheckpointConfig
)
self
.
checkpoint
_cfg
=
checkpoint_config
if
self
.
checkpoint
_cfg
:
assert
isinstance
(
self
.
checkpoint
_cfg
,
CheckpointConfig
)
serial
=
io
.
get_latest_checkpoint_serial
(
self
.
checkpoint
.
checkpoint_dir
)
self
.
checkpoint
.
load_serial
=
serial
if
serial
>=
0
else
None
self
.
checkpoint
_cfg
.
checkpoint_dir
)
self
.
checkpoint
_cfg
.
load_serial
=
serial
if
serial
>=
0
else
None
self
.
scope
=
core
.
Scope
()
...
...
@@ -174,19 +173,20 @@ class Trainer(object):
exe
=
executor
.
Executor
(
place
)
exe
.
run
(
self
.
startup_program
)
if
self
.
checkpoint
and
self
.
checkpoint
.
load_serial
:
if
self
.
checkpoint
_cfg
and
self
.
checkpoint_cfg
.
load_serial
:
with
self
.
_prog_and_scope_guard
():
exe
=
executor
.
Executor
(
place
)
io
.
load_checkpoint
(
exe
,
self
.
checkpoint
.
checkpoint_dir
,
self
.
checkpoint
.
load_serial
,
io
.
load_checkpoint
(
exe
,
self
.
checkpoint
_cfg
.
checkpoint_dir
,
self
.
checkpoint
_cfg
.
load_serial
,
self
.
startup_program
)
if
not
self
.
checkpoint
.
is_pserver
:
if
not
self
.
checkpoint
_cfg
.
is_pserver
:
epoch_id
,
step_id
=
io
.
load_trainer_args
(
self
.
checkpoint
.
checkpoint_dir
,
self
.
checkpoint
.
load_serial
,
self
.
trainer_id
,
self
.
_get_checkpoint_load_args
())
self
.
checkpoint
.
epoch_id
=
int
(
epoch_id
)
self
.
checkpoint
.
step_id
=
int
(
step_id
)
self
.
checkpoint_cfg
.
checkpoint_dir
,
self
.
checkpoint_cfg
.
load_serial
,
self
.
trainer_id
,
self
.
_get_checkpoint_load_args
())
self
.
checkpoint_cfg
.
epoch_id
=
int
(
epoch_id
)
self
.
checkpoint_cfg
.
step_id
=
int
(
step_id
)
if
param_path
and
os
.
path
.
isdir
(
param_path
):
# load params from param_path into scope
...
...
@@ -256,7 +256,7 @@ class Trainer(object):
t
.
transpile
(
self
.
trainer_id
,
pservers
=
pserver_endpoints
,
trainers
=
trainers
)
if
training_role
==
"PSERVER"
:
if
self
.
checkpoint
:
if
self
.
checkpoint
_cfg
:
self
.
is_pserver
=
True
self
.
train_program
=
t
.
get_pserver_program
(
current_endpoint
)
...
...
@@ -351,10 +351,10 @@ class Trainer(object):
self
.
_train_by_any_executor
(
event_handler
,
exe
,
num_epochs
,
reader
)
def
_train_by_any_executor
(
self
,
event_handler
,
exe
,
num_epochs
,
reader
):
if
self
.
checkpoint
:
if
self
.
checkpoint
_cfg
:
epochs
=
[
epoch_id
for
epoch_id
in
range
(
num_epochs
)
if
epoch_id
>=
self
.
checkpoint
.
epoch_id
if
epoch_id
>=
self
.
checkpoint
_cfg
.
epoch_id
]
else
:
epochs
=
[
epoch_id
for
epoch_id
in
range
(
num_epochs
)]
...
...
@@ -366,8 +366,8 @@ class Trainer(object):
self
.
_clean_checkpoint
()
return
if
self
.
checkpoint
and
self
.
checkpoint
.
load_serial
\
and
self
.
checkpoint
.
step_id
>=
step_id
and
self
.
checkpoint
.
epoch_id
==
epoch_id
:
if
self
.
checkpoint
_cfg
and
self
.
checkpoint_cfg
.
load_serial
\
and
self
.
checkpoint
_cfg
.
step_id
>=
step_id
and
self
.
checkpoint_cfg
.
epoch_id
==
epoch_id
:
continue
begin_event
=
BeginStepEvent
(
epoch_id
,
step_id
)
...
...
@@ -381,10 +381,12 @@ class Trainer(object):
else
:
metrics
=
exe
.
run
(
feed
=
data
,
fetch_list
=
[])
self
.
_save_checkpoint
(
epoch_id
,
step_id
)
if
self
.
checkpoint_cfg
:
self
.
_save_checkpoint
(
epoch_id
,
step_id
)
event_handler
(
EndStepEvent
(
epoch_id
,
step_id
,
metrics
))
event_handler
(
EndEpochEvent
(
epoch_id
))
self
.
_clean_checkpoint
()
if
self
.
checkpoint_cfg
:
self
.
_clean_checkpoint
()
def
_test_by_executor
(
self
,
reader
,
feed_order
,
fetch_list
):
with
executor
.
scope_guard
(
self
.
scope
):
...
...
@@ -424,9 +426,8 @@ class Trainer(object):
return
self
.
_get_parallel_executor
()
def
_clean_checkpoint
(
self
):
if
not
self
.
checkpoint
:
return
io
.
clean_checkpoint
(
checkpoint_dir
=
self
.
checkpoint
.
checkpoint_dir
)
assert
self
.
checkpoint_cfg
io
.
clean_checkpoint
(
checkpoint_dir
=
self
.
checkpoint_cfg
.
checkpoint_dir
)
def
_get_checkpoint_load_args
(
self
):
"""
...
...
@@ -444,19 +445,18 @@ class Trainer(object):
return
trainer_args
def
_save_checkpoint
(
self
,
epoch_id
,
step_id
):
if
not
self
.
checkpoint
:
return
assert
self
.
checkpoint_cfg
if
epoch_id
%
self
.
checkpoint
.
epoch_interval
==
0
and
step_id
%
self
.
checkpoint
.
step_interval
==
0
:
if
epoch_id
%
self
.
checkpoint
_cfg
.
epoch_interval
==
0
and
step_id
%
self
.
checkpoint_cfg
.
step_interval
==
0
:
exe
=
executor
.
Executor
(
self
.
place
)
io
.
save_checkpoint
(
executor
=
exe
,
checkpoint_dir
=
self
.
checkpoint
.
checkpoint_dir
,
checkpoint_dir
=
self
.
checkpoint
_cfg
.
checkpoint_dir
,
trainer_id
=
self
.
trainer_id
,
is_chief
=
self
.
chief
,
trainer_args
=
self
.
_get_checkpoint_save_args
(
epoch_id
,
step_id
),
main_program
=
self
.
train_program
,
max_num_checkpoints
=
self
.
checkpoint
.
max_num_checkpoints
)
max_num_checkpoints
=
self
.
checkpoint
_cfg
.
max_num_checkpoints
)
def
build_feed_var_list
(
program
,
feed_order
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录