机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Commit 5eea5db9
Authored May 29, 2018 by tangwei12
Parent: 514b2427

optimized checkpoint and save_model

Showing 3 changed files with 58 additions and 44 deletions:

python/paddle/fluid/__init__.py    +1  -0
python/paddle/fluid/io.py          +28 -33
python/paddle/fluid/trainer.py     +29 -11
python/paddle/fluid/__init__.py

@@ -26,6 +26,7 @@ from trainer import BeginEpochEvent
 from trainer import EndEpochEvent
 from trainer import BeginStepEvent
 from trainer import EndStepEvent
+from trainer import CheckpointConfig
 import inferencer
 from inferencer import Inferencer
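Re-exporting CheckpointConfig here makes the new checkpoint settings reachable from the top-level fluid namespace. A minimal construction sketch, using the constructor arguments introduced in trainer.py further down; the directory path is a placeholder for illustration:

import paddle.fluid as fluid

# CheckpointConfig is re-exported by python/paddle/fluid/__init__.py in this commit,
# so it can be constructed straight from the fluid namespace.
ckpt_config = fluid.CheckpointConfig(
    checkpoint_dir="/tmp/paddle_ckpt",  # placeholder path for illustration
    max_num_checkpoints=3,
    epoch_interval=1,
    step_interval=10)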
python/paddle/fluid/io.py

@@ -491,7 +491,6 @@ CHECKPOINT_SEPARATOR = "_"
 def save_checkpoint(executor,
                     checkpoint_dir=None,
                     max_num_checkpoints=3,
-                    save_interval_secs=600,
                     main_program=None):
     """
     Save Checkpoint will save persistable LodTensor variables from main_program in checkpoint directory,
@@ -511,15 +510,10 @@ def save_checkpoint(executor,
     if not os.path.isdir(checkpoint_dir):
         os.makedirs(checkpoint_dir)
 
-    serial = _get_lastest_checkpoint_dir(checkpoint_dir)
-    if serial >= 0 and not _interval_secs_exceed(
-            _get_serial_dir(serial, checkpoint_dir), save_interval_secs):
-        return
-
-    serial += 1
-    cur_dir = _get_serial_dir(serial, checkpoint_dir)
+    serial = _get_lastest_checkpoint_dir(checkpoint_dir) + 1
+    cur_dir = _get_serial_dir(checkpoint_dir, serial)
 
-    load_persist_vars_without_grad(executor, cur_dir, main_program)
+    save_persist_vars_without_grad(executor, cur_dir, main_program)
     _write_success(cur_dir)
     _lru_delete(checkpoint_dir, max_num_checkpoints)
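With the save_interval_secs guard removed, every call to save_checkpoint now writes a fresh serial directory and relies on _lru_delete to cap the total count. A hedged call sketch against the signature in this hunk; the executor, program, and path are placeholders:

import paddle.fluid as fluid

exe = fluid.Executor(fluid.CPUPlace())
main_program = fluid.default_main_program()  # placeholder program for illustration

# New signature: no save_interval_secs. Each call bumps the checkpoint serial;
# directories beyond max_num_checkpoints are pruned afterwards.
fluid.io.save_checkpoint(
    executor=exe,
    checkpoint_dir="/tmp/paddle_ckpt",
    max_num_checkpoints=3,
    main_program=main_program)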
@@ -542,7 +536,7 @@ def load_checkpoint(executor, checkpoint_dir=None, main_program=None):
     if serial < 0:
         return
 
-    cur_dir = _get_serial_dir(serial, checkpoint_dir)
+    cur_dir = _get_serial_dir(checkpoint_dir, serial)
     load_persist_vars_without_grad(executor, cur_dir, main_program)
@@ -559,11 +553,6 @@ def clean_checkpoint(checkpoint_dir, delete_dir=False):
         os.rmdir(checkpoint_dir)
 
 
-def _get_serial_dir(serial, checkpoint_dir):
-    serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
-    return os.path.join(checkpoint_dir, serial_folder)
-
-
 def _is_checkpoint_var(var):
     """
     the checkpoint will not save or load all the variables.
@@ -582,29 +571,37 @@ def _is_checkpoint_var(var):
     return var.persistable
 
 
-def _interval_secs_exceed(dirname, save_interval_secs):
-    dir_time = os.path.getmtime(dirname)
-    if save_interval_secs > (time.time() - dir_time):
-        return False
-    return True
+def _get_dir_serial(dirname):
+    _, serial = dirname.split(CHECKPOINT_SEPARATOR)
+
+    serial_num = -1
+    try:
+        serial_num = int(serial)
+    except ValueError:
+        serial_num = -1
+    return serial_num
+
+
+def _get_serial_dir(dirname, serial):
+    serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
+    return os.path.join(dirname, serial_folder)
 
 
 def _lru_delete(dirname, max_num_checkpoints=3):
     dirs = os.listdir(dirname)
-    serials = []
+    serial_map = {}
     for serial in dirs:
-        try:
-            serials.append(int(serial))
-        except ValueError:
-            continue
+        serial_num = _get_dir_serial(serial)
+        serial_map[serial_num] = serial
 
-    if len(serials) <= max_num_checkpoints:
+    if len(serial_map.keys()) <= max_num_checkpoints:
         return
 
+    serials = serial_map.keys()
     serials.sort(reverse=True)
     serials = serials[max_num_checkpoints:]
     for serial in serials:
-        cur_dir = os.path.join(dirname, str(serial))
+        cur_dir = _get_serial_dir(dirname, serial)
         shutil.rmtree(cur_dir)
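The rewritten helpers standardize the argument order of _get_serial_dir to (dirname, serial) and key the LRU pruning on serial numbers parsed from checkpoint folder names. A small standalone sketch of that naming and pruning convention, not the module's own code; it assumes the prefix is "checkpoint", since only CHECKPOINT_SEPARATOR = "_" is visible in this diff:

import os
import shutil

CHECKPOINT_PREFIX = "checkpoint"  # assumed value, not shown in this diff
CHECKPOINT_SEPARATOR = "_"

def serial_dir(root, serial):
    # e.g. serial 7 under /ckpt -> /ckpt/checkpoint_7
    return os.path.join(root, CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial))

def prune_old(root, keep=3):
    # Mirrors the _lru_delete idea: map parsed serials to folder names,
    # keep the newest `keep`, delete the rest.
    serial_map = {}
    for name in os.listdir(root):
        _, _, num = name.partition(CHECKPOINT_SEPARATOR)
        if num.isdigit():
            serial_map[int(num)] = name
    for serial in sorted(serial_map, reverse=True)[keep:]:
        shutil.rmtree(os.path.join(root, serial_map[serial]))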
@@ -633,20 +630,18 @@ def _get_lastest_checkpoint_dir(checkpoint_dir):
         """
         is _SUCCESS in this dir
         """
-        _, serial = cur_dir.split(CHECKPOINT_SEPARATOR)
-        try:
-            int(serial)
-        except ValueError:
+        serial = _get_dir_serial(cur_dir)
+        if serial == -1:
             return -1
 
         if not os.path.isdir(os.path.join(checkpoint_dir, cur_dir)):
             return -1
 
         success_path = os.path.join(
-            _get_serial_dir(serial, checkpoint_dir), SUCCESS_MARK_FILENAME)
+            _get_serial_dir(checkpoint_dir, serial), SUCCESS_MARK_FILENAME)
         if os.path.isfile(success_path):
-            return int(serial)
+            return serial
 
     if not os.path.isdir(checkpoint_dir):
         return -1
python/paddle/fluid/trainer.py

@@ -60,11 +60,24 @@ class CheckpointConfig(object):
     def __init__(self,
                  checkpoint_dir=None,
                  max_num_checkpoints=3,
-                 save_interval_secs=600):
+                 epoch_interval=1,
+                 step_interval=10):
         if checkpoint_dir is None:
             self.checkpoint_dir = os.getcwd()
         else:
             self.checkpoint_dir = checkpoint_dir
         self.max_num_checkpoints = max_num_checkpoints
-        self.save_interval_secs = save_interval_secs
+
+        if epoch_interval < 1:
+            self.epoch_interval = 1
+        else:
+            self.epoch_interval = epoch_interval
+
+        if step_interval < 1:
+            self.step_interval = 10
+        else:
+            self.step_interval = step_interval
 
 
 def check_and_get_place(place):
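One behavioral detail of the new constructor: interval arguments below 1 are not stored verbatim but replaced with fallbacks. A tiny illustration, assuming the class exactly as defined in this hunk:

import paddle.fluid as fluid

cfg = fluid.CheckpointConfig(epoch_interval=0, step_interval=-5)
# Values below 1 are replaced by the fallbacks coded above.
assert cfg.epoch_interval == 1
assert cfg.step_interval == 10
# checkpoint_dir was omitted, so it defaults to os.getcwd().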
@@ -290,14 +303,6 @@ class Trainer(object):
             exe = executor.Executor(self.place)
             io.save_persistables(exe, dirname=param_path)
 
-    def _save_checkpoint(self):
-        if self.checkpoint and self.chief:
-            exe = executor.Executor(self.place)
-            io.save_checkpoint(exe, self.checkpoint.checkpoint_dir,
-                               self.checkpoint.max_num_checkpoints,
-                               self.checkpoint.save_interval_secs,
-                               self.train_program)
-
     @contextlib.contextmanager
     def _prog_and_scope_guard(self):
         with framework.program_guard(
@@ -343,8 +348,9 @@ class Trainer(object):
                         ])
                 else:
                     metrics = exe.run(feed=data, fetch_list=[])
                 event_handler(EndStepEvent(epoch_id, step_id, metrics))
-                self._save_checkpoint()
+                self._save_checkpoint(epoch_id, step_id)
             event_handler(EndEpochEvent(epoch_id))
 
     def _test_by_executor(self, reader, feed_order, fetch_list):
@@ -384,6 +390,18 @@ class Trainer(object):
                 loss_name=self.train_func_outputs[0].name)
         return self._get_parallel_executor()
 
+    def _save_checkpoint(self, epoch_id, step_id):
+        if not self.checkpoint or not self.chief:
+            return
+
+        if epoch_id % self.checkpoint.epoch_interval == 0 and step_id % self.checkpoint.step_interval == 0:
+            exe = executor.Executor(self.place)
+            io.save_checkpoint(
+                executor=exe,
+                checkpoint_dir=self.checkpoint.checkpoint_dir,
+                max_num_checkpoints=self.checkpoint.max_num_checkpoints,
+                main_program=self.train_program)
+
 
 def build_feed_var_list(program, feed_order):
     if not isinstance(program, framework.Program):
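The new _save_checkpoint is gated purely by counters: a checkpoint is written only when both epoch_id and step_id are divisible by their configured intervals, and only on the chief worker. A standalone sketch of just that gate, mirroring the condition above rather than reproducing the Trainer:

def should_save(epoch_id, step_id, epoch_interval=1, step_interval=10):
    # Mirrors the condition in Trainer._save_checkpoint: save only when both
    # counters line up with their configured intervals.
    return epoch_id % epoch_interval == 0 and step_id % step_interval == 0

# With the defaults, steps 0, 10, 20, ... of every epoch trigger a save.
assert should_save(epoch_id=2, step_id=20)
assert not should_save(epoch_id=2, step_id=7)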