Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
2f44585e
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
2f44585e
编写于
6月 06, 2018
作者:
T
tangwei12
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
code optimized
上级
53409a29
变更
3
显示空白变更内容
内联
并排
Showing
3 changed file
with
52 addition
and
53 deletion
+52
-53
python/paddle/fluid/io.py
python/paddle/fluid/io.py
+20
-22
python/paddle/fluid/tests/unittests/test_checkpoint.py
python/paddle/fluid/tests/unittests/test_checkpoint.py
+2
-1
python/paddle/fluid/trainer.py
python/paddle/fluid/trainer.py
+30
-30
未找到文件。
python/paddle/fluid/io.py
浏览文件 @
2f44585e
...
...
@@ -476,14 +476,14 @@ def save_checkpoint(executor,
to keep numbers of checkpoint directory, the numbers of checkpoint directory are max_num_checkpoints at most,
The interval between two saved checkpoints must greater than save_interval_secs.
:param executor
:param checkpoint_dir
:param trainer_id
:param is_chief
:param main_program
:param max_num_checkpoints
"""
if
checkpoint_dir
.
strip
()
is
None
:
:param executor
executor for save the value
:param checkpoint_dir
the checkpoint directory
:param trainer_id
currect trainer id
:param is_chief
if the trainer id equals 0, the is_chief will be true
:param main_program
will save all variables in program
:param max_num_checkpoints
will keep numbers of checkpoint serials not bigger than max_num_checkpoints
"""
if
checkpoint_dir
is
None
:
raise
ValueError
(
"'checkpoint_dir' should not be None"
)
if
trainer_args
:
...
...
@@ -500,7 +500,7 @@ def save_checkpoint(executor,
if
is_chief
:
save_persist_vars_without_grad
(
executor
,
cur_dir
,
main_program
)
_
lru
_delete
(
checkpoint_dir
,
max_num_checkpoints
)
_
scroll
_delete
(
checkpoint_dir
,
max_num_checkpoints
)
def
load_checkpoint
(
executor
,
checkpoint_dir
,
serial
,
main_program
):
...
...
@@ -508,13 +508,13 @@ def load_checkpoint(executor, checkpoint_dir, serial, main_program):
Load checkpoint from a directory by executor,
it will find the most recent saved checkpoint file and load it auto.
:param executor
:param checkpoint_dir
:param serial
:param main_program
:param executor
executor for load the value
:param checkpoint_dir
the checkpoint directory
:param serial
the serial folder in checkpoint directory will be load
:param main_program
will load all variables in program
"""
if
checkpoint_dir
.
strip
()
is
None
:
if
checkpoint_dir
is
None
:
raise
ValueError
(
"'checkpoint_dir' should not be None"
)
if
serial
is
None
or
serial
<
0
:
...
...
@@ -536,9 +536,9 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False):
:param delete_dir
"""
if
checkpoint_dir
.
strip
()
is
None
:
if
checkpoint_dir
is
None
:
raise
ValueError
(
"'checkpoint_dir' should not be None"
)
_
lru
_delete
(
checkpoint_dir
,
max_num_checkpoints
=
0
)
_
scroll
_delete
(
checkpoint_dir
,
max_num_checkpoints
=
0
)
if
delete_dir
and
not
os
.
listdir
(
checkpoint_dir
):
os
.
rmdir
(
checkpoint_dir
)
...
...
@@ -681,7 +681,7 @@ def _get_trainer_dir(dirname, trainer_id):
return
trainer_dir
def
_
lru
_delete
(
dirname
,
max_num_checkpoints
=
3
):
def
_
scroll
_delete
(
dirname
,
max_num_checkpoints
=
3
):
dirs
=
os
.
listdir
(
dirname
)
serial_map
=
{}
for
serial
in
dirs
:
...
...
@@ -717,7 +717,7 @@ def get_latest_checkpoint_serial(checkpoint_dir):
:param checkpoint_dir
"""
if
not
checkpoint_dir
.
strip
()
:
if
not
checkpoint_dir
:
return
-
1
def
has_success
(
checkpoint_dir
,
cur_dir
):
...
...
@@ -726,10 +726,8 @@ def get_latest_checkpoint_serial(checkpoint_dir):
"""
serial
=
_get_dir_serial
(
cur_dir
)
if
serial
==
-
1
:
return
-
1
if
not
os
.
path
.
isdir
(
os
.
path
.
join
(
checkpoint_dir
,
cur_dir
)):
if
serial
==
-
1
or
not
os
.
path
.
isdir
(
os
.
path
.
join
(
checkpoint_dir
,
cur_dir
)):
return
-
1
success_path
=
os
.
path
.
join
(
...
...
python/paddle/fluid/tests/unittests/test_checkpoint.py
浏览文件 @
2f44585e
...
...
@@ -15,11 +15,12 @@
import
paddle.fluid
as
fluid
import
unittest
import
os
import
tempfile
class
TestCheckpoint
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
dirname
=
"/tmp/ckpt"
self
.
dirname
=
tempfile
.
mktemp
()
self
.
max_num_checkpoints
=
3
self
.
epoch_interval
=
1
self
.
step_interval
=
1
...
...
python/paddle/fluid/trainer.py
浏览文件 @
2f44585e
...
...
@@ -132,19 +132,18 @@ class Trainer(object):
# 1. we need to generate a framework.Program by calling
# program_func. Reference: fluid.program_guard in
# test_word2vec.py
if
not
isinstance
(
optimizer
,
opt_module
.
Optimizer
):
raise
TypeError
(
"The optimizer should be an instance of Optimizer"
)
assert
isinstance
(
optimizer
,
opt_module
.
Optimizer
)
# config for checkpoint
# only chief worker will save variables
self
.
trainer_id
=
0
self
.
chief
=
True
self
.
checkpoint
=
checkpoint_config
if
self
.
checkpoint
:
assert
isinstance
(
self
.
checkpoint
,
CheckpointConfig
)
self
.
checkpoint
_cfg
=
checkpoint_config
if
self
.
checkpoint
_cfg
:
assert
isinstance
(
self
.
checkpoint
_cfg
,
CheckpointConfig
)
serial
=
io
.
get_latest_checkpoint_serial
(
self
.
checkpoint
.
checkpoint_dir
)
self
.
checkpoint
.
load_serial
=
serial
if
serial
>=
0
else
None
self
.
checkpoint
_cfg
.
checkpoint_dir
)
self
.
checkpoint
_cfg
.
load_serial
=
serial
if
serial
>=
0
else
None
self
.
scope
=
core
.
Scope
()
...
...
@@ -174,19 +173,20 @@ class Trainer(object):
exe
=
executor
.
Executor
(
place
)
exe
.
run
(
self
.
startup_program
)
if
self
.
checkpoint
and
self
.
checkpoint
.
load_serial
:
if
self
.
checkpoint
_cfg
and
self
.
checkpoint_cfg
.
load_serial
:
with
self
.
_prog_and_scope_guard
():
exe
=
executor
.
Executor
(
place
)
io
.
load_checkpoint
(
exe
,
self
.
checkpoint
.
checkpoint_dir
,
self
.
checkpoint
.
load_serial
,
io
.
load_checkpoint
(
exe
,
self
.
checkpoint
_cfg
.
checkpoint_dir
,
self
.
checkpoint
_cfg
.
load_serial
,
self
.
startup_program
)
if
not
self
.
checkpoint
.
is_pserver
:
if
not
self
.
checkpoint
_cfg
.
is_pserver
:
epoch_id
,
step_id
=
io
.
load_trainer_args
(
self
.
checkpoint
.
checkpoint_dir
,
self
.
checkpoint
.
load_serial
,
self
.
trainer_id
,
self
.
_get_checkpoint_load_args
())
self
.
checkpoint
.
epoch_id
=
int
(
epoch_id
)
self
.
checkpoint
.
step_id
=
int
(
step_id
)
self
.
checkpoint_cfg
.
checkpoint_dir
,
self
.
checkpoint_cfg
.
load_serial
,
self
.
trainer_id
,
self
.
_get_checkpoint_load_args
())
self
.
checkpoint_cfg
.
epoch_id
=
int
(
epoch_id
)
self
.
checkpoint_cfg
.
step_id
=
int
(
step_id
)
if
param_path
and
os
.
path
.
isdir
(
param_path
):
# load params from param_path into scope
...
...
@@ -256,7 +256,7 @@ class Trainer(object):
t
.
transpile
(
self
.
trainer_id
,
pservers
=
pserver_endpoints
,
trainers
=
trainers
)
if
training_role
==
"PSERVER"
:
if
self
.
checkpoint
:
if
self
.
checkpoint
_cfg
:
self
.
is_pserver
=
True
self
.
train_program
=
t
.
get_pserver_program
(
current_endpoint
)
...
...
@@ -351,10 +351,10 @@ class Trainer(object):
self
.
_train_by_any_executor
(
event_handler
,
exe
,
num_epochs
,
reader
)
def
_train_by_any_executor
(
self
,
event_handler
,
exe
,
num_epochs
,
reader
):
if
self
.
checkpoint
:
if
self
.
checkpoint
_cfg
:
epochs
=
[
epoch_id
for
epoch_id
in
range
(
num_epochs
)
if
epoch_id
>=
self
.
checkpoint
.
epoch_id
if
epoch_id
>=
self
.
checkpoint
_cfg
.
epoch_id
]
else
:
epochs
=
[
epoch_id
for
epoch_id
in
range
(
num_epochs
)]
...
...
@@ -366,8 +366,8 @@ class Trainer(object):
self
.
_clean_checkpoint
()
return
if
self
.
checkpoint
and
self
.
checkpoint
.
load_serial
\
and
self
.
checkpoint
.
step_id
>=
step_id
and
self
.
checkpoint
.
epoch_id
==
epoch_id
:
if
self
.
checkpoint
_cfg
and
self
.
checkpoint_cfg
.
load_serial
\
and
self
.
checkpoint
_cfg
.
step_id
>=
step_id
and
self
.
checkpoint_cfg
.
epoch_id
==
epoch_id
:
continue
begin_event
=
BeginStepEvent
(
epoch_id
,
step_id
)
...
...
@@ -381,9 +381,11 @@ class Trainer(object):
else
:
metrics
=
exe
.
run
(
feed
=
data
,
fetch_list
=
[])
if
self
.
checkpoint_cfg
:
self
.
_save_checkpoint
(
epoch_id
,
step_id
)
event_handler
(
EndStepEvent
(
epoch_id
,
step_id
,
metrics
))
event_handler
(
EndEpochEvent
(
epoch_id
))
if
self
.
checkpoint_cfg
:
self
.
_clean_checkpoint
()
def
_test_by_executor
(
self
,
reader
,
feed_order
,
fetch_list
):
...
...
@@ -424,9 +426,8 @@ class Trainer(object):
return
self
.
_get_parallel_executor
()
def
_clean_checkpoint
(
self
):
if
not
self
.
checkpoint
:
return
io
.
clean_checkpoint
(
checkpoint_dir
=
self
.
checkpoint
.
checkpoint_dir
)
assert
self
.
checkpoint_cfg
io
.
clean_checkpoint
(
checkpoint_dir
=
self
.
checkpoint_cfg
.
checkpoint_dir
)
def
_get_checkpoint_load_args
(
self
):
"""
...
...
@@ -444,19 +445,18 @@ class Trainer(object):
return
trainer_args
def
_save_checkpoint
(
self
,
epoch_id
,
step_id
):
if
not
self
.
checkpoint
:
return
assert
self
.
checkpoint_cfg
if
epoch_id
%
self
.
checkpoint
.
epoch_interval
==
0
and
step_id
%
self
.
checkpoint
.
step_interval
==
0
:
if
epoch_id
%
self
.
checkpoint
_cfg
.
epoch_interval
==
0
and
step_id
%
self
.
checkpoint_cfg
.
step_interval
==
0
:
exe
=
executor
.
Executor
(
self
.
place
)
io
.
save_checkpoint
(
executor
=
exe
,
checkpoint_dir
=
self
.
checkpoint
.
checkpoint_dir
,
checkpoint_dir
=
self
.
checkpoint
_cfg
.
checkpoint_dir
,
trainer_id
=
self
.
trainer_id
,
is_chief
=
self
.
chief
,
trainer_args
=
self
.
_get_checkpoint_save_args
(
epoch_id
,
step_id
),
main_program
=
self
.
train_program
,
max_num_checkpoints
=
self
.
checkpoint
.
max_num_checkpoints
)
max_num_checkpoints
=
self
.
checkpoint
_cfg
.
max_num_checkpoints
)
def
build_feed_var_list
(
program
,
feed_order
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录