Commit 0067a2e4 (unverified)

Save checkpoint automatically (#25917)

Authored Aug 08, 2020 by gongweibao; committed via GitHub on Aug 08, 2020. Parent commit: e853ece0.
Showing 18 changed files with 1,933 additions and 269 deletions.
python/paddle/fluid/executor.py  (+7 -0)
python/paddle/fluid/framework.py  (+22 -1)
python/paddle/fluid/incubate/checkpoint/__init__.py  (+13 -0)
python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py  (+687 -0)
python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py  (+223 -0)
python/paddle/fluid/incubate/fleet/collective/__init__.py  (+21 -165)
python/paddle/fluid/incubate/fleet/utils/fs.py  (+20 -4)
python/paddle/fluid/incubate/fleet/utils/hdfs.py  (+159 -72)
python/paddle/fluid/tests/unittests/CMakeLists.txt  (+20 -8)
python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py  (+131 -0)
python/paddle/fluid/tests/unittests/test_auto_checkpoint.py  (+357 -0)
python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py  (+77 -0)
python/paddle/fluid/tests/unittests/test_checkpoint_saver.py  (+57 -0)
python/paddle/fluid/tests/unittests/test_desc_clone.py  (+2 -1)
python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py  (+30 -10)
python/paddle/fluid/tests/unittests/test_fs_interface.py  (+3 -1)
python/paddle/fluid/tests/unittests/test_hdfs.py  (+103 -7)
python/setup.py.in  (+1 -0)
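Taken together, the diffs below wire automatic checkpointing into Executor.run: wrapping the epoch loop in acp.train_epoch_range saves a checkpoint each epoch and, after a job restart, resumes from the last completed epoch. The following is a minimal sketch pieced together from the tests this commit adds (auto_checkpoint_utils.py, test_auto_checkpoint.py); checkpointing only activates under the PADDLE_EDL_* environment those tests configure, and reading the second train_epoch_range argument as a save interval is an assumption based on PADDLE_EDL_SAVE_CHECKPOINT_INTER:

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp

# Build a tiny network (mirrors simple_net in auto_checkpoint_utils.py below).
main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    image = fluid.data(name='image', shape=[-1, 16, 16], dtype='float32')
    label = fluid.data(name='label', shape=[-1, 1], dtype='int64')
    fc = fluid.layers.fc(image, size=10)
    loss = fluid.layers.reduce_mean(
        fluid.layers.softmax_with_cross_entropy(fc, label))
    fluid.optimizer.SGD(learning_rate=1e-3).minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup_prog)

# Wrapping the epoch loop in train_epoch_range is what makes each
# Executor.run call checkpoint-aware; after a restart the loop resumes
# from the last saved epoch instead of epoch 0.
for epoch in acp.train_epoch_range(3, 0):
    image_np = np.random.random(size=[16, 16, 16]).astype('float32')
    label_np = np.random.randint(0, 10, size=[16, 1]).astype('int64')
    exe.run(main_prog,
            feed={'image': image_np, 'label': label_np},
            fetch_list=[loss])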
python/paddle/fluid/executor.py

@@ -25,11 +25,13 @@ import six
 from .data_feeder import convert_dtype
 from .framework import Program, default_main_program, Variable, Operator, convert_np_dtype_to_dtype_
 from . import core
+from . import unique_name
 from . import compiler
 from .. import compat as cpt
 from .trainer_factory import TrainerFactory
 from .trainer_factory import FetchHandlerMonitor
 import copy
+from .incubate.checkpoint import auto_checkpoint as acp

 __all__ = ['Executor', 'global_scope', 'scope_guard']

@@ -559,6 +561,9 @@ class Executor(object):
         self._closed = False
         self.pruned_program_scope_caches = dict()
+        self._auto_checkpoint_name = unique_name.generate(
+            "__auto_checkpoint_executor__")

     def _get_scope_cache(self, program_cache_key):
         return self.scope_caches.get(program_cache_key, None)

@@ -1152,6 +1157,8 @@ class Executor(object):
         compiled = isinstance(program, compiler.CompiledProgram)

+        acp._auto_checkpoint(self, program)
+
         # For backward compatibility, run directly.
         if not compiled:
             # In distributed training, the compiled program is saved in Program._graph
python/paddle/fluid/framework.py

@@ -2385,12 +2385,29 @@ class Operator(object):
     def _is_optimize_op(self):
         op_maker = core.op_proto_and_checker_maker
         OPTIMIZE = core.op_proto_and_checker_maker.OpRole.Optimize
+
+        if not self.desc.has_attr(op_maker.kOpRoleAttrName()):
+            return False
+
         op_role = self.desc.attr(op_maker.kOpRoleAttrName())
         if op_role & int(OPTIMIZE):
             return True
         else:
             return False

+    def _is_backward_op(self):
+        op_maker = core.op_proto_and_checker_maker
+        BACKWARD = core.op_proto_and_checker_maker.OpRole.Backward
+
+        if not self.desc.has_attr(op_maker.kOpRoleAttrName()):
+            return False
+
+        op_role = self.desc.attr(op_maker.kOpRoleAttrName())
+        if op_role & int(BACKWARD):
+            return True
+
+        return False
+

 class Block(object):
     """

@@ -3942,6 +3959,10 @@ class Program(object):
         # appending gradients times
         self._appending_grad_times = 0

+        # identifier for auto checkpoint
+        self._auto_checkpoint_name = unique_name.generate(
+            "__auto_checkpoint_program__")
+
         # compiled program, i.e. Graph
         self._graph = None
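Both helpers test role membership with a bitwise AND rather than equality because the op-role attribute is a bit field: a single operator can carry several roles at once. A tiny self-contained illustration of why the AND test is the right one (the flag values below are hypothetical, not Paddle's):

# Hypothetical role flags; one op may combine several of them.
FORWARD, BACKWARD, OPTIMIZE, LOSS = 0x0, 0x1, 0x2, 0x100

op_role = BACKWARD | LOSS        # e.g. the loss op inside the backward pass
assert op_role & BACKWARD        # an _is_backward_op-style check matches
assert not (op_role & OPTIMIZE)  # an _is_optimize_op-style check does not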
python/paddle/fluid/incubate/checkpoint/__init__.py (new file, mode 100644)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py (new file, mode 100644)

(This diff is collapsed in the original view: 687 additions.)
python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py (new file, mode 100644)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ..fleet.utils.fs import FS, LocalFS
from ..fleet.utils.hdfs import HDFSClient
from ...compiler import CompiledProgram


class SerializableBase(object):
    def serialize(self, path):
        raise NotImplementedError

    def deserialize(self, path):
        raise NotImplementedError


class PaddleModel(SerializableBase):
    def __init__(self, exe, program):
        self._exe = exe
        self._origin_program = program
        self._program = program
        if isinstance(program, CompiledProgram):
            self._program = program._program

        self._file_name = "_paddle_fleet_param__"

    def serialize(self, path):
        from ...io import save_persistables
        save_persistables(
            executor=self._exe,
            dirname=path,
            main_program=self._program,
            filename=self._file_name)

    def deserialize(self, path):
        from ...io import load_persistables
        load_persistables(
            executor=self._exe,
            dirname=path,
            main_program=self._program,
            filename=self._file_name)


class CheckpointSaver(object):
    def __init__(self, fs):
        self._fs = fs
        self._checkpoint_prefix = "__paddle_checkpoint__"

    def save_checkpoint(self, path, slists, trainer_id=None,
                        local_cache_path=".cache"):
        """
        Serialize objects in slists to path.
        Return the real saved path and checkpoint_no.
        """
        if not self._fs.is_exist(path):
            self._fs.mkdirs(path)
        else:
            assert self._fs.is_dir(path), "path:{} must be a directory".format(
                path)

        max_no = self._get_last_checkpoint_no(path)

        if max_no < 0:
            max_no = -1
        max_no += 1

        real_path = "{}/{}.{}".format(path, self._checkpoint_prefix, max_no)
        tmp_path = "{}.tmp".format(real_path)
        saved_path = tmp_path

        local_fs = LocalFS()

        cache_path = None
        if self._fs.need_upload_download():
            cache_path = "{}/{}.{}.saved_cache".format(
                local_cache_path, self._checkpoint_prefix, max_no)

            if trainer_id is not None:
                cache_path = "{}.{}".format(cache_path, trainer_id)

            if not local_fs.is_exist(cache_path):
                local_fs.mkdirs(cache_path)
            else:
                assert local_fs.is_dir(cache_path), \
                    "cache path:{} must be a directory".format(cache_path)

            saved_path = cache_path

        for s in slists:
            s.serialize(saved_path)

        if self._fs.need_upload_download():
            self._fs.delete(tmp_path)
            self._fs.upload(cache_path, tmp_path)
            local_fs.delete(cache_path)
        self._fs.mv(tmp_path, real_path)

        return real_path, max_no

    def load_checkpoint(self, path, slists, trainer_id,
                        local_cache_path=".cache", checkpoint_no=None,
                        ignore_empty=True):
        """
        Deserialize objects in slists from path.
        Return the real load path.
        """
        if checkpoint_no is None:
            max_no = self._get_last_checkpoint_no(path)

            if not ignore_empty:
                assert max_no >= 0, "Can't find checkpoint"

            if max_no < 0:
                return None

            checkpoint_no = max_no
        else:
            assert isinstance(checkpoint_no, int)
            assert checkpoint_no >= 0

        local_fs = LocalFS()
        if self._fs.need_upload_download():
            cache_path = "{}/{}.{}.load_cache".format(
                local_cache_path, self._checkpoint_prefix, checkpoint_no)

            if trainer_id is not None:
                cache_path = "{}.{}".format(cache_path, trainer_id)

            if not local_fs.is_exist(local_cache_path):
                local_fs.mkdirs(local_cache_path)
            if local_fs.is_exist(cache_path):
                local_fs.delete(cache_path)

        real_path = "{}/{}.{}".format(path, self._checkpoint_prefix,
                                      checkpoint_no)
        load_path = real_path
        if self._fs.need_upload_download():
            self._fs.download(real_path, cache_path)
            load_path = cache_path

        for s in slists:
            s.deserialize(load_path)

        if self._fs.need_upload_download() and cache_path:
            local_fs.delete(cache_path)

        return real_path

    def get_checkpoint_no(self, root_path):
        a = []
        dirs = self._fs.list_dirs(root_path)
        for d in dirs:
            g = d.split(".")
            if len(g) != 2:
                continue

            if g[0] != self._checkpoint_prefix:
                continue

            try:
                n = int(g[1])
                a.append(n)
            except:
                continue

        a.sort()
        return a

    def _get_last_checkpoint_no(self, root_path):
        """
        Only look at the first directory depth.
        """
        a = self.get_checkpoint_no(root_path)
        if len(a) > 0:
            return a[-1]

        return -1

    def clean_redundant_checkpoints(self, root_path, reserved=[]):
        max_no = self._get_last_checkpoint_no(root_path)
        if max_no < 0:
            return

        s = set(reserved)
        if len(s) == 0:
            s.add(max_no)

        dirs = self._fs.list_dirs(root_path)
        for d in dirs:
            g = d.split(".")
            if len(g) != 2:
                continue

            if g[0] != self._checkpoint_prefix:
                continue

            try:
                n = int(g[1])
                if n not in s:
                    path = "{}/{}.{}".format(root_path,
                                             self._checkpoint_prefix, n)
                    self._fs.delete(path)
            except Exception as e:
                print(e)
                continue
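CheckpointSaver operates on anything implementing the SerializableBase interface above. Below is a usage sketch with LocalFS and a toy serializable object; TinyState is hypothetical, written only for this illustration (it creates its own target directory, which PaddleModel gets for free from save_persistables), and it assumes LocalFS reports need_upload_download() as False, as the save path above implies:

import os
from paddle.fluid.incubate.fleet.utils.fs import LocalFS
from paddle.fluid.incubate.checkpoint.checkpoint_saver import (
    CheckpointSaver, SerializableBase)


class TinyState(SerializableBase):
    def __init__(self, epoch_no=0):
        self.epoch_no = epoch_no

    def serialize(self, path):
        # With a local filesystem the serializable must create the
        # staging directory itself.
        if not os.path.exists(path):
            os.makedirs(path)
        with open("{}/tiny_state".format(path), 'w') as f:
            f.write(str(self.epoch_no))

    def deserialize(self, path):
        with open("{}/tiny_state".format(path), 'r') as f:
            self.epoch_no = int(f.read())


saver = CheckpointSaver(LocalFS())
state = TinyState(epoch_no=5)

# Each save creates <root>/__paddle_checkpoint__.<n>, with n increasing.
real_path, no = saver.save_checkpoint("./ckpt_root", [state], trainer_id=0)

restored = TinyState()
saver.load_checkpoint("./ckpt_root", [restored], trainer_id=0)
assert restored.epoch_no == 5

# Keeps only the newest checkpoint number by default.
saver.clean_redundant_checkpoints("./ckpt_root")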
python/paddle/fluid/incubate/fleet/collective/__init__.py

@@ -27,6 +27,7 @@ from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
 from paddle.fluid import compiler
 from paddle.fluid.incubate.fleet.utils.fs import LocalFS
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel, CheckpointSaver

 import os
 import sys

@@ -46,21 +47,6 @@ class DistFCConfig(object):
     pass


-class TrainStatus(object):
-    def __init__(self, epoch_no=-1):
-        # completed epoch
-        self._epoch_no = epoch_no
-
-    def next(self):
-        return self._epoch_no + 1
-
-    def __eq__(self, t):
-        return self._epoch_no == t._epoch_no
-
-    def __ne__(self, t):
-        return not self == t
-
-
 class Collective(Fleet):
     def __init__(self):
         super(Collective, self).__init__(Mode.COLLECTIVE)

@@ -152,90 +138,10 @@ class Collective(Fleet):
         io.save_persistables(executor, dirname, main_program, filename=filename)

-    def _save_train_status(self, path, train_status):
-        d = {}
-        d["epoch_no"] = train_status._epoch_no
-
-        file_name = "{}/fleet_train_status".format(path)
-        with open(file_name, 'w') as f:
-            json.dump(d, f)
-
-    def _load_train_status(self, path):
-        file_name = "{}/fleet_train_status".format(path)
-
-        r = TrainStatus()
-        if not os.path.isfile(file_name):
-            return r
-
-        d = {}
-        with open(file_name, 'r') as f:
-            d = json.load(f)
-
-        assert "epoch_no" in d, "Can't find epoch_no in dict from train_status file:{}".format(d)
-        r._epoch_no = d["epoch_no"]
-        assert r._epoch_no >= 0, "Data in checkpoint file is not valid:{}".format(d)
-
-        return r
-
-    def _get_last_checkpoint_no(self, root_path, fs):
-        """
-        only get the first depth
-        """
-        max_no = -1
-        d = {}
-        dirs = fs.list_dirs(root_path)
-        for d in dirs:
-            g = d.split(".")
-            if len(g) != 2:
-                continue
-
-            if g[0] != "__paddle_fleet_checkpoint__":
-                continue
-
-            try:
-                n = int(g[1])
-                if n > max_no:
-                    max_no = n
-            except:
-                continue
-
-        return max_no
-
-    def clean_redundant_checkpoints(self, root_path, fs=LocalFS(), checkpoint_num=1):
-        max_no = self._get_last_checkpoint_no(root_path, fs)
-        if max_no < 0:
-            return
-
-        if checkpoint_num < 1:
-            checkpoint_num = 1
-
-        dirs = fs.list_dirs(root_path)
-        for d in dirs:
-            g = d.split(".")
-            if len(g) != 2:
-                continue
-
-            if g[0] != self._checkpoint_prefix:
-                continue
-
-            try:
-                n = int(g[1])
-                if n <= max_no - checkpoint_num:
-                    path = "{}/{}.{}".format(root_path, self._checkpoint_prefix, n)
-                    fs.delete(path)
-            except Exception as e:
-                print(e)
-                continue
-
     def save_checkpoint(self,
                         executor,
                         path,
+                        trainer_id,
                         train_status,
                         main_program=None,
                         fs=LocalFS(),

@@ -248,53 +154,25 @@ class Collective(Fleet):
         if main_program == None:
             main_program = self._transpiled_program

-        if not fs.is_exist(path):
-            fs.mkdirs(path)
-        else:
-            assert fs.is_dir(path), "path:{} must be a directory".format(path)
-
-        max_no = self._get_last_checkpoint_no(path, fs=fs)
-        if max_no < 0:
-            max_no = -1
-
-        real_path = "{}/{}.{}".format(path, self._checkpoint_prefix, max_no + 1)
-        tmp_path = "{}.tmp".format(real_path)
-        saved_path = tmp_path
-
-        local_fs = LocalFS()
-
-        cache_path = None
-        if fs.need_upload_download():
-            cache_path = "{}/{}.{}.saved_cache".format(
-                local_cache_path, self._checkpoint_prefix, max_no + 1)
-            if not local_fs.is_exist(cache_path):
-                local_fs.mkdirs(cache_path)
-            else:
-                assert fs.is_dir(path), "cache path:{} must be a directory".format(cache_path)
-            saved_path = cache_path
-
-        self.save_persistables(
-            executor=executor,
-            dirname=saved_path,
-            main_program=main_program,
-            filename=self._param_file_name)
-        self._save_train_status(path=saved_path, train_status=train_status)
-
-        if fs.need_upload_download():
-            fs.delete(tmp_path)
-            fs.upload(cache_path, tmp_path)
-        fs.mv(tmp_path, real_path)
+        m = PaddleModel(executor, main_program)
+        t = train_status
+        c = CheckpointSaver(fs)
+        real_path, checkpoint_no = c.save_checkpoint(
+            path=path,
+            slists=[m, t],
+            trainer_id=trainer_id,
+            local_cache_path=local_cache_path)

         if not remain_all_checkpoint:
-            self.clean_redundant_checkpoints(path)
+            c.clean_redundant_checkpoints(path)

+        return real_path, checkpoint_no
+
     def load_checkpoint(self,
                         executor,
                         path,
                         trainer_id,
+                        train_status,
                         main_program=None,
                         fs=LocalFS(),
                         local_cache_path=".cache",

@@ -302,39 +180,17 @@ class Collective(Fleet):
         """
         This function load persistables and current epoch num from path.
         """
-        max_no = self._get_last_checkpoint_no(path, fs)
-
-        if not ignore_empty:
-            assert max_no >= 0, "Can't find checkpoint"
-
-        if max_no < 0:
-            return None
-
-        local_fs = LocalFS()
-        if fs.need_upload_download():
-            cache_path = "{}/{}.{}.load_cache.{}".format(
-                local_cache_path, self._checkpoint_prefix, max_no, trainer_id)
-            if not local_fs.is_exist(local_cache_path):
-                local_fs.mkdirs(local_cache_path)
-            if local_fs.is_exist(cache_path):
-                local_fs.delete(cache_path)
-
-        real_path = "{}/{}.{}".format(path, self._checkpoint_prefix, max_no)
-        load_path = real_path
-        if fs.need_upload_download():
-            fs.download(real_path, cache_path)
-            load_path = cache_path
-
         if main_program == None:
             main_program = self._transpiled_program

-        io.load_persistables(
-            executor=executor,
-            dirname=load_path,
-            main_program=main_program,
-            filename=self._param_file_name)
-
-        return self._load_train_status(load_path)
+        m = PaddleModel(executor, main_program)
+        c = CheckpointSaver(fs)
+        return c.load_checkpoint(
+            path, [m, train_status],
+            trainer_id=trainer_id,
+            ignore_empty=ignore_empty,
+            local_cache_path=local_cache_path)


 fleet = Collective()
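Most of the bookkeeping deleted above, staging a save in a .tmp directory and renaming it to its final name only once writing succeeds, now lives in CheckpointSaver. A standalone sketch of that staging pattern (paths hypothetical):

import os
import shutil

root = "./ckpt_demo"
real = os.path.join(root, "__paddle_checkpoint__.0")
tmp = real + ".tmp"

shutil.rmtree(root, ignore_errors=True)
os.makedirs(tmp)
with open(os.path.join(tmp, "params"), "w") as f:
    f.write("model bytes")  # a crash here leaves only the .tmp directory

os.rename(tmp, real)        # publish the finished save under its real name
assert os.path.isdir(real)  # readers never observe a half-written checkpoint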
python/paddle/fluid/incubate/fleet/utils/fs.py

@@ -45,6 +45,10 @@ class FSTimeOut(Exception):
     pass


+class FSShellCmdAborted(ExecuteError):
+    pass
+
+
 class FS(object):
     @abc.abstractmethod
     def ls_dir(self, fs_path):

@@ -87,7 +91,7 @@ class FS(object):
         raise NotImplementedError

     @abc.abstractmethod
-    def mv(self, fs_src_path, fs_dst_path):
+    def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=False):
         raise NotImplementedError

     @abc.abstractmethod

@@ -98,6 +102,10 @@ class FS(object):
     def list_dirs(self, fs_path):
         raise NotImplementedError

+    @abc.abstractmethod
+    def touch(self, fs_path, exist_ok=True):
+        raise NotImplementedError
+

 class LocalFS(FS):
     def ls_dir(self, fs_path):

@@ -138,13 +146,21 @@ class LocalFS(FS):
     def is_exist(self, fs_path):
         return os.path.exists(fs_path)

-    def touch(self, fs_path):
-        return Path(fs_path).touch()
+    def touch(self, fs_path, exist_ok=True):
+        if self.is_exist(fs_path):
+            if exist_ok:
+                return
+            raise FSFileExistsError
+
+        return Path(fs_path).touch(exist_ok=True)

-    def mv(self, src_path, dst_path):
+    def mv(self, src_path, dst_path, overwrite=False, test_exists=False):
         if not self.is_exist(src_path):
             raise FSFileNotExistsError

+        if overwrite and self.is_exist(dst_path):
+            self.delete(dst_path)
+
         if self.is_exist(dst_path):
             raise FSFileExistsError
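A small sketch exercising the LocalFS semantics introduced here: touch() now honors exist_ok, and mv() gains an overwrite flag. It assumes LocalFS.delete and is_exist behave as the rest of this file shows (delete on a missing path is a no-op):

from paddle.fluid.incubate.fleet.utils.fs import LocalFS, FSFileExistsError

fs = LocalFS()
fs.delete("./a.flag")
fs.delete("./b.flag")

fs.touch("./a.flag")                  # creates the file
fs.touch("./a.flag", exist_ok=True)   # second touch is a no-op
try:
    fs.touch("./a.flag", exist_ok=False)
except FSFileExistsError:
    pass                              # an existing file now raises

fs.touch("./b.flag")
fs.mv("./a.flag", "./b.flag", overwrite=True)  # deletes b.flag, then renames
assert fs.is_exist("./b.flag") and not fs.is_exist("./a.flag")
fs.delete("./b.flag")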
python/paddle/fluid/incubate/fleet/utils/hdfs.py

@@ -26,8 +26,8 @@ import time
 import logging
 import six
 from . import fs
-from .fs import FS, LocalFS, FSFileExistsError, FSFileNotExistsError, ExecuteError, FSTimeOut
-import paddle.fluid as fluid
+from .fs import FS, LocalFS, FSFileExistsError, FSFileNotExistsError, ExecuteError, FSTimeOut, FSShellCmdAborted
+from paddle.fluid import core
 import functools

 from pathlib import PurePosixPath, Path

@@ -36,21 +36,39 @@ import shutil
 __all__ = ["HDFSClient"]


-def _handle_errors(f):
-    def handler(*args, **kwargs):
-        start = time.time()
-        while True:
-            try:
-                return f(*args, **kwargs)
-            except ExecuteError as e:
-                o = args[0]
-                time_out = float(o._time_out) / 1000.0
-                inter = float(o._sleep_inter) / 1000.0
-                if time.time() - start >= time_out:
-                    raise FSTimeOut
-                time.sleep(inter)
-
-    return functools.wraps(f)(handler)
+def _handle_errors(max_time_out=None):
+    def decorator(f):
+        @functools.wraps(f)
+        def handler(*args, **kwargs):
+            o = args[0]
+            time_out = max_time_out
+            if time_out is None:
+                time_out = float(o._time_out) / 1000.0
+            else:
+                time_out /= 1000.0
+            inter = float(o._sleep_inter) / 1000.0
+
+            start = time.time()
+            last_print_time = start
+            while True:
+                try:
+                    return f(*args, **kwargs)
+                # important: only ExecuteError need to retry
+                except ExecuteError as e:
+                    if time.time() - start >= time_out:
+                        raise FSTimeOut("args:{} timeout:{}".format(
+                            args, time.time() - start))
+                    time.sleep(inter)
+
+                if time.time() - last_print_time > 30:
+                    print("hadoop operator timeout:args:{} timeout:{}".format(
+                        args, time.time() - start))
+                    last_print_time = time.time()
+
+        return handler
+
+    return decorator


 class HDFSClient(FS):

@@ -72,6 +90,7 @@ class HDFSClient(FS):
         if configs:
             for k, v in six.iteritems(configs):
                 config_command = '-D%s=%s' % (k, v)
+                self.pre_commands.append(config_command)

         self._time_out = time_out
         self._sleep_inter = sleep_inter

@@ -80,17 +99,22 @@ class HDFSClient(FS):
             r'\s?responseErrorMsg\s?\:.*, errorCode\:\s?[0-9]+, path\:')

     def _run_cmd(self, cmd, redirect_stderr=False):
-        ret, output = fluid.core.shell_execute_cmd(cmd, 0, 0, redirect_stderr)
-        return int(ret), output.splitlines()
+        exe_cmd = "{} -{}".format(self._base_cmd, cmd)
+        ret, output = core.shell_execute_cmd(exe_cmd, 0, 0, redirect_stderr)
+        ret = int(ret)
+        if ret == 134:
+            raise FSShellCmdAborted(cmd)
+        return ret, output.splitlines()

+    @_handle_errors()
     def list_dirs(self, fs_path):
         if not self.is_exist(fs_path):
             return []

-        dirs, _ = self.ls_dir(fs_path)
+        dirs, files = self._ls_dir(fs_path)
         return dirs

-    @_handle_errors
+    @_handle_errors()
     def ls_dir(self, fs_path):
         """
         list directory under fs_path, and only give the pure name, not include the fs_path

@@ -98,11 +122,14 @@ class HDFSClient(FS):
         if not self.is_exist(fs_path):
             return [], []

-        cmd = "{} -ls {}".format(self._base_cmd, fs_path)
+        return self._ls_dir(fs_path)
+
+    def _ls_dir(self, fs_path):
+        cmd = "ls {}".format(fs_path)
         ret, lines = self._run_cmd(cmd)

         if ret != 0:
-            raise ExecuteError
+            raise ExecuteError(cmd)

         dirs = []
         files = []

@@ -111,9 +138,6 @@ class HDFSClient(FS):
             if len(arr) != 8:
                 continue

-            if fs_path not in arr[7]:
-                continue
-
             p = PurePosixPath(arr[7])
             if arr[0][0] == 'd':
                 dirs.append(p.name)

@@ -130,18 +154,20 @@ class HDFSClient(FS):
         return None

-    @_handle_errors
+    @_handle_errors()
     def is_dir(self, fs_path):
         if not self.is_exist(fs_path):
             return False

-        cmd = "{} -test -d {}".format(self._base_cmd, fs_path, redirect_stderr=True)
+        return self._is_dir(fs_path)
+
+    def _is_dir(self, fs_path):
+        cmd = "test -d {}".format(fs_path, redirect_stderr=True)
         ret, lines = self._run_cmd(cmd)
         if ret:
             # other error
-            if self._test_match(lines) != None:
-                raise ExecuteError
+            if self._test_match(lines):
+                raise ExecuteError(cmd)

             return False

@@ -151,94 +177,155 @@ class HDFSClient(FS):
         if not self.is_exist(fs_path):
             return False

-        return not self.is_dir(fs_path)
+        return not self._is_dir(fs_path)

-    @_handle_errors
+    @_handle_errors()
     def is_exist(self, fs_path):
-        cmd = "{} -ls {} ".format(self._base_cmd, fs_path)
+        cmd = "ls {} ".format(fs_path)
         ret, out = self._run_cmd(cmd, redirect_stderr=True)
         if ret != 0:
             for l in out:
                 if "No such file or directory" in l:
                     return False
-            raise ExecuteError
+            raise ExecuteError(cmd)

         return True

-    @_handle_errors
+    # can't retry
     def upload(self, local_path, fs_path):
         if self.is_exist(fs_path):
-            raise FSFileExistsError
+            raise FSFileExistsError("{} exists".format(fs_path))

         local = LocalFS()
         if not local.is_exist(local_path):
-            raise FSFileNotExistsError
-
-        cmd = "{} -put {} {}".format(self._base_cmd, local_path, fs_path)
-        ret, lines = self._run_cmd(cmd)
-        if ret != 0:
-            raise ExecuteError
+            raise FSFileNotExistsError("{} not exists".format(local_path))
+
+        return self._try_upload(local_path, fs_path)

-    @_handle_errors
+    @_handle_errors()
+    def _try_upload(self, local_path, fs_path):
+        cmd = "put {} {}".format(local_path, fs_path)
+        ret = 0
+        try:
+            ret, lines = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+        except Exception as e:
+            self.delete(fs_path)
+            raise e
+
+    # can't retry
     def download(self, fs_path, local_path):
         if self.is_exist(local_path):
-            raise FSFileExistsError
+            raise FSFileExistsError("{} exists".format(local_path))

         if not self.is_exist(fs_path):
-            raise FSFileNotExistsError
-
-        cmd = "{} -get {} {}".format(self._base_cmd, fs_path, local_path)
-        ret, lines = self._run_cmd(cmd)
-        if ret != 0:
-            raise ExecuteError
+            raise FSFileNotExistsError("{} not exists".format(fs_path))
+
+        return self._try_download(fs_path, local_path)

-    @_handle_errors
+    @_handle_errors()
+    def _try_download(self, fs_path, local_path):
+        cmd = "get {} {}".format(fs_path, local_path)
+        ret = 0
+        try:
+            ret, lines = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+        except Exception as e:
+            local_fs = LocalFS()
+            local_fs.delete(local_path)
+            raise e
+
+    @_handle_errors()
     def mkdirs(self, fs_path):
         if self.is_exist(fs_path):
             return

-        cmd = "{} -mkdir {}".format(self._base_cmd, fs_path)
-        ret, lines = self._run_cmd(cmd)
-        if ret != 0:
-            raise ExecuteError
+        out_hdfs = False
+
+        cmd = "mkdir {} ".format(fs_path)
+        ret, out = self._run_cmd(cmd, redirect_stderr=True)
+        if ret != 0:
+            for l in out:
+                if "No such file or directory" in l:
+                    out_hdfs = True
+                    break
+            if not out_hdfs:
+                raise ExecuteError(cmd)
+
+        if out_hdfs and not self.is_exist(fs_path):
+            cmd = "mkdir -p {}".format(fs_path)
+            ret, lines = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)

-    @_handle_errors
-    def mv(self, fs_src_path, fs_dst_path, test_exists=True):
+    def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True):
+        if overwrite and self.is_exist(fs_dst_path):
+            self.delete(fs_dst_path)
+
         if test_exists:
             if not self.is_exist(fs_src_path):
-                raise FSFileNotExistsError
+                raise FSFileNotExistsError("{} does not exist".format(fs_src_path))

             if self.is_exist(fs_dst_path):
-                raise FSFileExistsError
+                raise FSFileExistsError("{} exists already".format(fs_dst_path))

-        cmd = "{} -mv {} {}".format(self._base_cmd, fs_src_path, fs_dst_path)
-        ret, _ = self._run_cmd(cmd)
-        if ret != 0:
-            raise ExecuteError
+        return self._try_mv(fs_src_path, fs_dst_path)
+
+    @_handle_errors()
+    def _try_mv(self, fs_src_path, fs_dst_path):
+        cmd = "mv {} {}".format(fs_src_path, fs_dst_path)
+        ret = 0
+        try:
+            ret, _ = self._run_cmd(cmd)
+            if ret != 0:
+                raise ExecuteError(cmd)
+        except Exception as e:
+            if not self.is_exist(fs_src_path) and \
+                    self.is_exist(fs_dst_path):
+                return
+            raise e

-    @_handle_errors
     def _rmr(self, fs_path):
-        cmd = "{} -rmr {}".format(self._base_cmd, fs_path)
+        cmd = "rmr {}".format(fs_path)
         ret, _ = self._run_cmd(cmd)
         if ret != 0:
-            raise ExecuteError
+            raise ExecuteError(cmd)

-    @_handle_errors
     def _rm(self, fs_path):
-        cmd = "{} -rm {}".format(self._base_cmd, fs_path)
+        cmd = "rm {}".format(fs_path)
         ret, _ = self._run_cmd(cmd)
         if ret != 0:
-            raise ExecuteError
+            raise ExecuteError(cmd)

+    @_handle_errors()
     def delete(self, fs_path):
         if not self.is_exist(fs_path):
             return

-        is_dir = self.is_dir(fs_path)
+        is_dir = self._is_dir(fs_path)
         if is_dir:
             return self._rmr(fs_path)

         return self._rm(fs_path)

+    def touch(self, fs_path, exist_ok=True):
+        if self.is_exist(fs_path):
+            if exist_ok:
+                return
+            raise FSFileExistsError
+
+        return self._touchz(fs_path)
+
+    @_handle_errors()
+    def _touchz(self, fs_path):
+        cmd = "touchz {}".format(fs_path)
+        ret, _ = self._run_cmd(cmd)
+        if ret != 0:
+            raise ExecuteError
+
     def need_upload_download(self):
         return True
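_handle_errors above is now a decorator factory: calling it, optionally with a per-call timeout, returns the actual decorator, which is why call sites change from @_handle_errors to @_handle_errors(). A standalone sketch of the same retry-with-budget pattern:

import functools
import time

def retry(max_seconds=1.0, interval=0.1):
    def decorator(f):
        @functools.wraps(f)
        def handler(*args, **kwargs):
            start = time.time()
            while True:
                try:
                    return f(*args, **kwargs)
                except IOError:
                    # give up once the total time budget is exhausted
                    if time.time() - start >= max_seconds:
                        raise TimeoutError("retry budget exhausted")
                    time.sleep(interval)
        return handler
    return decorator

@retry(max_seconds=0.5)
def flaky():
    raise IOError("transient failure")  # retried until the budget runs out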
python/paddle/fluid/tests/unittests/CMakeLists.txt

@@ -86,6 +86,10 @@ if(WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_ref_by_trainer_id_op)
 endif()

+LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint)
+LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint2)
+LIST(REMOVE_ITEM TEST_OPS test_checkpoint_saver)
+
 if(APPLE OR WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_hdfs)
     LIST(REMOVE_ITEM TEST_OPS test_fs_interface)

@@ -190,10 +194,11 @@ function(bash_test_modules TARGET_NAME)
     endif()

     set(options SERIAL)
-    set(oneValueArgs "")
-    set(multiValueArgs MODULES DEPS ENVS LABELS)
+    set(oneValueArgs TIMEOUT START_BASH)
+    set(multiValueArgs DEPS ENVS LABELS)
     cmake_parse_arguments(bash_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

+    set(timeout 350)
+    if(${bash_test_modules_TIMEOUT})
+        set(timeout ${bash_test_modules_TIMEOUT})

@@ -204,13 +209,13 @@ function(bash_test_modules TARGET_NAME)
             COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
             TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${bash_test_modules_ENVS}
             WITH_COVERAGE=ON COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data
-            bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_MODULES}
+            bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_START_BASH}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     else()
         add_test(NAME ${TARGET_NAME}
             COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python
             TEST_TARGET_NAME=${TARGET_NAME} TEST_TIMEOUT=${timeout} ${bash_test_modules_ENVS}
-            bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_MODULES}
+            bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_START_BASH}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     endif()

@@ -397,15 +402,16 @@ if(WITH_DISTRIBUTE)
     if(NOT APPLE)
         if(WITH_GPU)
             # NOTE. test_launch only work in gpu collective mode
-            bash_test_modules(test_launch MODULES test_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+            bash_test_modules(test_launch START_BASH test_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
             py_test_modules(test_fleet_checkpoint MODULES test_fleet_checkpoint)
         endif()
-        bash_test_modules(test_launch_ps MODULES test_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
-        bash_test_modules(test_fleet_launch MODULES test_fleet_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+        bash_test_modules(test_launch_ps START_BASH test_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+        bash_test_modules(test_fleet_launch START_BASH test_fleet_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})

         set(dist_ut_port 20001)
         foreach(TEST_OP ${DIST_TEST_OPS})
-            bash_test_modules(${TEST_OP} MODULES dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}")
+            bash_test_modules(${TEST_OP} START_BASH dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}")
             MATH(EXPR dist_ut_port "${dist_ut_port}+50")
         endforeach(TEST_OP)
     endif(NOT APPLE)

@@ -441,6 +447,12 @@ if(NOT WIN32)
     set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
 endif()

+if(NOT APPLE AND NOT WIN32)
+    bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 600)
+    bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 600)
+    bash_test_modules(test_checkpoint_saver START_BASH dist_test.sh TIMEOUT 600)
+endif()
+
 add_subdirectory(sequence)
 add_subdirectory(dygraph_to_static)
python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py (new file, mode 100644)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import paddle
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
import os
import sys

from paddle.fluid.incubate.fleet.utils.fs import LocalFS
from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
from paddle.fluid.framework import program_guard
from paddle.fluid import unique_name

import numpy as np
from paddle.io import Dataset, BatchSampler, DataLoader

BATCH_NUM = 20
BATCH_SIZE = 16

#IMAGE_SIZE = 128
CLASS_NUM = 10

USE_GPU = False  # whether to use GPU to run the model
places = fluid.cuda_places() if USE_GPU else fluid.cpu_places()

logger = None


def get_logger():
    global logger
    logger = acp._get_logger(20)
    return logger


def get_random_images_and_labels(image_shape, label_shape):
    image = np.random.random(size=image_shape).astype('float32')
    label = np.random.random(size=label_shape).astype('int64')
    return image, label


def sample_list_generator_creator():
    def __reader__():
        for _ in range(BATCH_NUM):
            sample_list = []
            for _ in range(BATCH_SIZE):
                image, label = get_random_images_and_labels([16, 16], [1])
                sample_list.append([image, label])

            yield sample_list

    return __reader__


class AutoCheckpointBase(unittest.TestCase):
    def _init_env(self,
                  exe,
                  main_prog,
                  startup_prog,
                  minimize=True,
                  iterable=True):
        def simple_net():
            image = fluid.data(
                name='image', shape=[-1, 16, 16], dtype='float32')
            label = fluid.data(name='label', shape=[-1, 1], dtype='int64')

            fc_tmp = fluid.layers.fc(image, size=CLASS_NUM)
            cross_entropy = fluid.layers.softmax_with_cross_entropy(fc_tmp,
                                                                    label)
            loss = fluid.layers.reduce_mean(cross_entropy)
            sgd = fluid.optimizer.SGD(learning_rate=1e-3)
            if minimize:
                sgd.minimize(loss)
            return sgd, loss, image, label

        with program_guard(main_prog, startup_prog):
            sgd, loss, image, label = simple_net()

            if minimize:
                compiled = fluid.CompiledProgram(main_prog).with_data_parallel(
                    loss_name=loss.name)
            else:
                compiled = None
            loader = fluid.io.DataLoader.from_generator(
                feed_list=[image, label],
                capacity=64,
                use_double_buffer=True,
                iterable=iterable)

            loader.set_sample_list_generator(sample_list_generator_creator(),
                                             places[0])

        if minimize:
            exe.run(startup_prog)

        return compiled, loader, sgd, loss, image, label

    def _generate(self):
        main_prog = fluid.Program()
        startup_prog = fluid.Program()
        exe = fluid.Executor(places[0])

        return exe, main_prog, startup_prog

    def _reset_generator(self):
        unique_name.generator = fluid.unique_name.UniqueNameGenerator()
        acp.generator = fluid.unique_name.UniqueNameGenerator()
        acp.g_acp_type = None
        acp.g_checker = acp.AutoCheckpointChecker()
        acp.g_program_attr = {}

    def _clear_envs(self):
        os.environ.pop("PADDLE_RUNNING_ENV", None)

    def _readd_envs(self):
        os.environ["PADDLE_RUNNING_ENV"] = "PADDLE_EDL_AUTO_CHECKPOINT"
python/paddle/fluid/tests/unittests/test_auto_checkpoint.py (new file, mode 100644)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import paddle
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
import os
import sys

from paddle.fluid.incubate.fleet.utils.fs import LocalFS
from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
from paddle.fluid.framework import program_guard
from paddle.fluid import unique_name

import numpy as np
from paddle.io import Dataset, BatchSampler, DataLoader

from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger

logger = get_logger()


class AutoCheckPointACLBase(AutoCheckpointBase):
    def setUp(self):
        get_logger()
        logger.info("enter tests")

        self._old_environ = dict(os.environ)
        proc_env = {
            "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
            "PADDLE_TRAINER_ID": "0",
            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
            "PADDLE_JOB_ID": "test_job_auto",
            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
            "PADDLE_EDL_HDFS_NAME": "",
            "PADDLE_EDL_HDFS_UGI": "",
            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint",
            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test",
            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
        }
        os.environ.update(proc_env)

    def tearDown(self):
        os.environ.clear()
        os.environ.update(self._old_environ)

    def _run_normal(self):
        exe, main_prog, startup_prog = self._generate()

        save_dir = "./run_save_model"
        fs = LocalFS()

        fs.delete(save_dir)
        logger.info("begin _run_normal")

        compiled, data_loader, optimizer, loss, image, label = self._init_env(
            exe, main_prog, startup_prog)
        for i in range(3):
            self.assertEqual(acp._get_train_epoch_range(), None)
            self.assertEqual(acp.g_acp_type, None)
            for data in data_loader():
                self.assertEqual(acp.g_acp_type, None)
                self.assertEqual(acp._get_train_epoch_range(), None)
                fetch = exe.run(compiled, feed=data, fetch_list=[loss])

        self.assertEqual(acp.g_acp_type, None)
        self.assertEqual(acp._get_train_epoch_range(), None)

        m1 = PaddleModel(exe, compiled)
        m1.serialize(save_dir)

        m2 = PaddleModel(exe, compiled)
        m2.deserialize(save_dir)

        logger.info("end _run_normal")
        fs.delete(save_dir)

    def _not_use_train(self):
        logger.info("begin _not_use_train")
        exe, main_prog, startup_prog = self._generate()

        compiled, data_loader, optimizer, loss, image, label = \
            self._init_env(exe, main_prog, startup_prog)

        epochs = []
        for i in acp.train_epoch_range(3, 0):
            epochs.append(i)
            for data in data_loader():
                fetch = exe.run(compiled, feed=data, fetch_list=[loss])

        self.assertEqual(epochs, [0, 1, 2])
        logger.info("end _not_use_train")

    def _run_save_0(self, break_epoch_no=None):
        logger.info("begin _run_save_0")
        fs = LocalFS()
        save_dir = "./run_save_0"
        fs.delete(save_dir)

        exe, main_prog, startup_prog = self._generate()

        compiled, data_loader, optimizer, loss, image, label = \
            self._init_env(exe, main_prog, startup_prog)

        o = None
        i = 0
        name = None
        for i in acp.train_epoch_range(3, 0):
            o = acp._get_train_epoch_range()
            name = o.name

            for data in data_loader():
                fetch = exe.run(compiled, feed=data, fetch_list=[loss])

            self.assertEqual(len(o._exe_status), 1)

            if break_epoch_no is not None:
                if i == break_epoch_no:
                    break

        o = acp._get_train_epoch_range()
        assert o == None, "train epoch range must not exist now"
        if break_epoch_no is None:
            self.assertEqual(i, 2)
        else:
            self.assertEqual(i, break_epoch_no)

        fs.delete(save_dir)
        logger.info("end _run_save_0")

    def _run_load_0(self, break_epoch_no=None):
        logger.info("begin _run_load_0")
        exe, main_prog, startup_prog = self._generate()

        fs = LocalFS()
        save_dir = "./run_load_0"
        fs.delete(save_dir)

        compiled, data_loader, optimizer, loss, image, label = self._init_env(
            exe, main_prog, startup_prog)

        o = None
        i = 0
        check = False

        epochs = []
        for i in acp.train_epoch_range(3, 0):
            epochs.append(i)

            for data in data_loader():
                fetch = exe.run(compiled, feed=data, fetch_list=[loss])

        o = acp._get_train_epoch_range()
        self.assertTrue(o == None, "train epoch range must not exist now")
        self.assertEqual(i, 2)

        if break_epoch_no is not None:
            if break_epoch_no == 0:
                self.assertEqual(epochs, [0, 1, 2])
            elif break_epoch_no == 1:
                self.assertEqual(epochs, [1, 2])
            elif break_epoch_no == 2:
                self.assertEqual(epochs, [2])
        else:
            self.assertEqual(epochs, [2])

        fs.delete(save_dir)
        logger.info("end _run_load_0")


class AutoCheckpointTest(AutoCheckPointACLBase):
    def setUp(self):
        get_logger()
        logger.info("enter tests")

        self._old_environ = dict(os.environ)
        proc_env = {
            "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
            "PADDLE_TRAINER_ID": "0",
            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
            "PADDLE_JOB_ID": "test_job_auto_1",
            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
            "PADDLE_EDL_HDFS_NAME": "",
            "PADDLE_EDL_HDFS_UGI": "",
            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_1",
            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_1",
            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
        }
        os.environ.update(proc_env)

    def test_normal(self):
        logger.info("begin test_normal")
        checker = acp._get_checker()

        fs = HDFSClient(checker.hdfs_home, None)

        fs.delete(checker.hdfs_checkpoint_path)
        self._clear_envs()
        self._reset_generator()
        self._run_normal()
        self._readd_envs()
        logger.info("end test_normal")

    def test_basic(self):
        logger.info("begin test_basic")
        checker = acp._get_checker()
        self.assertEqual(checker.run_env, "PADDLE_EDL_AUTO_CHECKPOINT")
        self.assertEqual(checker.platform, "PADDLE_CLOUD")
        self.assertEqual(checker.save_checkpoint_inter, 0)
        print(checker)

        fs = HDFSClient(checker.hdfs_home, None)

        fs.delete(checker.hdfs_checkpoint_path)
        self._reset_generator()
        self._run_save_0()

        self._reset_generator()
        self._run_load_0()

        logger.info("end test_basic")

    def test_not_use(self):
        logger.info("begin test_not_use")

        self._clear_envs()
        self._reset_generator()
        self._not_use_train()
        self._readd_envs()

        logger.info("end test_not_use")

    def test_multiple(self):
        checker = acp._get_checker()
        fs = HDFSClient(checker.hdfs_home, None)
        fs.delete(checker.hdfs_checkpoint_path)
        self._reset_generator()

        logger.info("begin test_multiple")
        fs = LocalFS()
        save_dir = "./run_save_0"
        fs.delete(save_dir)

        exe, main_prog1, startup_prog1 = self._generate()
        _, main_prog2, startup_prog2 = self._generate()

        compiled1, data_loader1, optimizer1, loss1, image1, label1 = \
            self._init_env(exe, main_prog1, startup_prog1)

        compiled2, data_loader2, optimizer2, loss2, image2, label2 = \
            self._init_env(exe, main_prog2, startup_prog2)

        o = None
        epochs = []
        for i in acp.train_epoch_range(3, 0):
            for data in data_loader1():
                fetch = exe.run(compiled1, feed=data, fetch_list=[loss1])

            for data in data_loader2():
                fetch = exe.run(compiled2, feed=data, fetch_list=[loss2])

            o = acp._get_train_epoch_range()
            self.assertEqual(len(o._exe_status), 2)
            print(o._exe_status)
            epochs.append(i)

        o = acp._get_train_epoch_range()
        self.assertTrue(o == None, "train epoch range must not exist now")
        self.assertEqual(i, 2)
        self.assertEqual(epochs, [0, 1, 2])

        fs.delete(save_dir)
        logger.info("end test_multiple")

    def test_distributed_basic(self):
        checker = acp._get_checker()
        fs = HDFSClient(checker.hdfs_home, None)
        fs.delete(checker.hdfs_checkpoint_path)
        self._reset_generator()

        logger.info("begin test_distributed_basic")
        fs = LocalFS()
        save_dir = "./run_save_0"
        fs.delete(save_dir)

        # basic
        exe, main_prog, startup_prog = self._generate()

        compiled, data_loader, optimizer, loss, image, label = \
            self._init_env(exe, main_prog, startup_prog, minimize=False)

        # fleet
        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        with fluid.program_guard(main_prog, startup_prog):
            dist_optimizer = fleet.distributed_optimizer(optimizer)
            dist_optimizer.minimize(loss)

        exe.run(startup_prog)

        o = None
        i = 0
        name = None
        for i in acp.train_epoch_range(3, 0):
            o = acp._get_train_epoch_range()
            name = o.name
            logger.info("_run_save_0 name:{} epoch_no:{}".format(o.name, i))

            for data in data_loader():
                fetch = exe.run(fleet.main_program,
                                feed=data,
                                fetch_list=[loss])

            self.assertEqual(len(o._exe_status), 1)

        o = acp._get_train_epoch_range()
        assert o == None, "train epoch range must not exist now"
        self.assertEqual(i, 2)

        fs.delete(save_dir)

        logger.info("end test_distributed_basic")

    def test_checker(self):
        os.environ.pop("PADDLE_JOB_ID", None)
        try:
            checker = acp.AutoCheckpointChecker()
            self.assertFalse(True)
        except Exception as e:
            pass
        os.environ["PADDLE_JOB_ID"] = "test_job_auto_1"


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py (new file, mode 100644)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import paddle
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
import os
import sys

from paddle.fluid.incubate.fleet.utils.fs import LocalFS
from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp
from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel
from paddle.fluid.framework import program_guard
from paddle.fluid import unique_name

import numpy as np
from paddle.io import Dataset, BatchSampler, DataLoader

from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger
from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase

logger = get_logger()


class AutoCheckpointTest2(AutoCheckPointACLBase):
    def setUp(self):
        get_logger()
        logger.info("enter tests")

        self._old_environ = dict(os.environ)
        proc_env = {
            "PADDLE_RUNNING_ENV": "PADDLE_EDL_AUTO_CHECKPOINT",
            "PADDLE_TRAINER_ID": "0",
            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
            "PADDLE_JOB_ID": "test_job_auto_2",
            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
            "PADDLE_EDL_HDFS_NAME": "",
            "PADDLE_EDL_HDFS_UGI": "",
            "PADDLE_EDL_HDFS_CHECKPOINT_PATH": "auto_checkpoint_2",
            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
            "PADDLE_EDL_FS_CACHE": ".auto_checkpoint_test_2",
            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0"
        }
        os.environ.update(proc_env)

    def test_corner_epoch_no(self):
        logger.info("begin test_corner_epoch_no")
        checker = acp._get_checker()
        fs = HDFSClient(checker.hdfs_home, None)

        for i in range(3):
            fs.delete(checker.hdfs_checkpoint_path)
            self._reset_generator()
            self._run_save_0(break_epoch_no=i)
            self._reset_generator()
            self._run_load_0(break_epoch_no=i)

        fs.delete(checker.hdfs_checkpoint_path)
        logger.info("end test_corner_epoch_no")


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_checkpoint_saver.py (new file, mode 100644)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
from paddle.fluid.incubate.checkpoint.auto_checkpoint import ExeTrainStatus
from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver
import os
import sys

from paddle.fluid.incubate.fleet.utils.fs import LocalFS
from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver


class CheckpointerSaverTest(unittest.TestCase):
    def test(self):
        fs = HDFSClient("/usr/local/hadoop-2.7.7", None)
        dir_path = "./checkpointsaver_test"
        fs.delete(dir_path)

        s = CheckpointSaver(fs)

        fs.mkdirs("{}/exe.exe".format(dir_path))
        fs.mkdirs("{}/exe.1".format(dir_path))
        fs.mkdirs("{}/exe".format(dir_path))

        a = s.get_checkpoint_no(dir_path)
        self.assertEqual(len(a), 0)

        fs.mkdirs("{}/__paddle_checkpoint__.0".format(dir_path))
        fs.mkdirs("{}/__paddle_checkpoint__.exe".format(dir_path))
        a = s.get_checkpoint_no(dir_path)
        self.assertEqual(len(a), 1)
        s.clean_redundant_checkpoints(dir_path)
        s.clean_redundant_checkpoints(dir_path)

        fs.delete(dir_path)


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_desc_clone.py

@@ -170,7 +170,8 @@ def program_equal(a, b):
                     k))
                 return False
             assert (len(a.blocks) == len(b.blocks))
+        elif k == '_auto_checkpoint_name':
+            continue
         elif (v != b.__dict__[k]):
             raise ValueError("In program_equal not equal:{0}\n".format(k))
python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py

@@ -15,12 +15,15 @@
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, TrainStatus
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
+from paddle.fluid.incubate.checkpoint.auto_checkpoint import ExeTrainStatus
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver
 import os
 import sys

 from paddle.fluid.incubate.fleet.utils.fs import LocalFS
 from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver


 class FleetTest(unittest.TestCase):

@@ -49,24 +52,35 @@ class FleetTest(unittest.TestCase):
         exe = fluid.Executor(fluid.CPUPlace())
         exe.run(fluid.default_startup_program())

-        status = TrainStatus(2)
-        fleet.save_checkpoint(exe, dir_path, train_status=status, fs=fs)
-        n1 = fleet._get_last_checkpoint_no(dir_path, fs=fs)
+        status = ExeTrainStatus()
+        status.epoch_no = 2
+        _, n1 = fleet.save_checkpoint(
+            exe, dir_path, trainer_id=0, train_status=status, fs=fs)

-        status2 = fleet.load_checkpoint(exe, dir_path, trainer_id=0, fs=fs)
+        status2 = ExeTrainStatus()
+        fleet.load_checkpoint(
+            exe, dir_path, trainer_id=0, fs=fs, train_status=status2)
         self.assertEqual(status2, status)

-        fleet.save_checkpoint(exe, dir_path, train_status=status, fs=fs)
-        n2 = fleet._get_last_checkpoint_no(dir_path, fs=fs)
+        _, n2 = fleet.save_checkpoint(
+            exe,
+            dir_path,
+            trainer_id=0,
+            train_status=status,
+            fs=fs,
+            remain_all_checkpoint=False)
         self.assertEqual(n2, n1 + 1)

-        fleet.clean_redundant_checkpoints(dir_path, fs=fs)
+        c = CheckpointSaver(fs)
+        cp_nos = c.get_checkpoint_no(dir_path)
+        assert len(cp_nos) == 1  # cleanup all others

-        # unnormal
+        # test remain_all_checkpoint
+        fleet.save_checkpoint(
+            exe,
+            dir_path,
+            trainer_id=0,
+            train_status=status,
+            fs=fs,
+            remain_all_checkpoint=False)

@@ -79,6 +93,7 @@ class FleetTest(unittest.TestCase):
         fleet.save_checkpoint(
             exe,
             dir_path,
             trainer_id=0,
             train_status=status,
             fs=fs,
+            cache_path=cache_path)

@@ -88,8 +103,13 @@ class FleetTest(unittest.TestCase):
         # can't load under a file
         try:
-            status2 = fleet.load_checkpoint(exe, dir_path, trainer_id=0, fs=fs, cache_path=cache_path)
+            fleet.load_checkpoint(
+                exe,
+                dir_path,
+                trainer_id=0,
+                train_status=status2,
+                fs=fs,
+                cache_path=cache_path)
             self.assertFalse(True)
         except:
             pass
python/paddle/fluid/tests/unittests/test_fs_interface.py

@@ -15,7 +15,7 @@
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, TrainStatus
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys
 import inspect

@@ -38,6 +38,8 @@ class FSTest(unittest.TestCase):
                 func(a)
             elif len(args) == 3:
                 func(a, a)
+            elif len(args) == 5:
+                func(a, a, a, a)

             print("args:", args, len(args), "func:", func)
             self.assertFalse(True)
         except NotImplementedError as e:
python/paddle/fluid/tests/unittests/test_hdfs.py

@@ -15,7 +15,7 @@
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
-from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, TrainStatus
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
 import os
 import sys

@@ -57,6 +57,12 @@ class FSTest(unittest.TestCase):
         fs.delete(dir_path)
         self.assertTrue(not fs.is_exist(dir_path))

+        fs.mkdirs(dir_path)
+        fs.mkdirs(new_dir_path)
+        fs.mv(dir_path, new_dir_path, overwrite=True)
+        self.assertTrue(not fs.is_exist(dir_path))
+        self.assertTrue(fs.is_exist(new_dir_path))
+
     def _test_touch_file(self, fs):
         file_path = os.path.abspath("./test_file")

@@ -104,6 +110,35 @@ class FSTest(unittest.TestCase):
         fs.delete(dst_file)
         fs.delete(src_file)

+    def _test_try_download(self, fs):
+        src_file = os.path.abspath("./test_try_download.src")
+        dst_file = os.path.abspath("./test_try_download.dst")
+        fs.delete(dst_file)
+        fs.delete(src_file)
+
+        try:
+            fs._try_download(src_file, dst_file)
+            self.assertFalse(True)
+        except Exception as e:
+            pass
+
+        fs.delete(dst_file)
+        fs.delete(src_file)
+
+    def _test_try_upload(self, fs):
+        src_file = os.path.abspath("./test_try_upload.src")
+        dst_file = os.path.abspath("./test_try_upload.dst")
+
+        try:
+            fs._try_upload(src_file, dst_file)
+            self.assertFalse(True)
+        except Exception as e:
+            pass
+
+        fs.delete(dst_file)
+        fs.delete(src_file)
+
     def _test_download(self, fs):
         src_file = os.path.abspath("./test_download.src")
         dst_file = os.path.abspath("./test_download.dst")

@@ -138,8 +173,27 @@ class FSTest(unittest.TestCase):
         fs.mkdirs(dir_name)
         fs.mkdirs(dir_name)

+    def _test_rm(self, fs):
+        dir_name = "./test_rm_no_exist.flag"
+        fs.delete(dir_name)
+        try:
+            fs._rmr(dir_name)
+            self.assertFalse(True)
+        except Exception as e:
+            pass
+
+        try:
+            fs._rm(dir_name)
+            self.assertFalse(True)
+        except Exception as e:
+            pass
+
     def test_exists(self):
-        fs = HDFSClient("/usr/local/hadoop-2.7.7/", None, time_out=15 * 1000)
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/", None, time_out=15 * 1000, sleep_inter=100)
         self.assertFalse(fs.is_exist(os.path.abspath("./xxxx")))
         self.assertFalse(fs.is_dir(os.path.abspath("./xxxx")))
         self.assertTrue(fs.is_dir(os.path.abspath("./xxx/..")))

@@ -149,27 +203,39 @@ class FSTest(unittest.TestCase):
         dirs, files = fs.ls_dir(os.path.abspath("./xxx/.."))

     def test_hdfs(self):
-        fs = HDFSClient("/usr/local/hadoop-2.7.7/", None, time_out=15 * 1000)
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/", None, time_out=15 * 1000, sleep_inter=100)
+        self._test_rm(fs)
+        self._test_touch(fs)
+        self._test_dirs(fs)
         self._test_upload(fs)
         self._test_download(fs)
         self._test_mkdirs(fs)
         self._test_list_dir(fs)
+        self._test_try_upload(fs)
+        self._test_try_download(fs)

     def test_local(self):
         fs = LocalFS()
+        self._test_rm(fs)
+        self._test_touch(fs)
         self._test_dirs(fs)
         self._test_touch_file(fs)
         self._test_mkdirs(fs)
         self._test_list_dir(fs)
+        self._test_try_upload(fs)
+        self._test_try_download(fs)

     def test_timeout(self):
         fs = HDFSClient(
             "/usr/local/hadoop-2.7.7/",
             None,
             time_out=6 * 1000,
-            sleep_inter=2000)
+            sleep_inter=100)
         src = "hdfs_test_timeout"
         dst = "new_hdfs_test_timeout"
         fs.delete(dst)

@@ -190,7 +256,11 @@ class FSTest(unittest.TestCase):
         print("second mv ret:{} output:{}".format(ret, output))

     def test_is_dir(self):
-        fs = HDFSClient("/usr/local/hadoop-2.7.7/", None, time_out=15 * 1000)
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/", None, time_out=15 * 1000, sleep_inter=100)
         self.assertFalse(fs.is_dir("./test_hdfs.py"))
         s = """
 java.io.IOException: Input/output error

@@ -212,12 +282,38 @@ java.io.IOException: Input/output error
     def test_config(self):
         config = {"fs.default.name": "hdfs://xxx", "hadoop.job.ugi": "ugi"}
-        fs = HDFSClient("/usr/local/hadoop-2.7.7/", config, time_out=15 * 1000)
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/", config, time_out=15 * 1000, sleep_inter=100)

     def _test_list_dir(self, fs):
-        fs = HDFSClient("/usr/local/hadoop-2.7.7/", None, time_out=15 * 1000)
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/", None, time_out=15 * 1000, sleep_inter=100)
         fs.ls_dir("test_not_exists")

+    def _test_touch(self, fs):
+        path = "./touch.flag"
+        fs.touch(path, exist_ok=True)
+        try:
+            fs.touch("./touch.flag", exist_ok=False)
+            self.assertFalse(0, "can't reach here")
+        except FSFileExistsError as e:
+            pass
+
+        try:
+            fs._touchz("./touch.flag")
+            self.assertFalse(True, "can't reach here")
+        except Exception as e:
+            pass
+
+        self.assertFalse(fs.is_dir(path))
+        fs.delete(path)
+

 if __name__ == '__main__':
     unittest.main()
python/setup.py.in

@@ -178,6 +178,7 @@ packages=['paddle',
           'paddle.fluid.incubate',
           'paddle.fluid.incubate.data_generator',
           'paddle.fluid.incubate.fleet',
+          'paddle.fluid.incubate.checkpoint',
           'paddle.fluid.incubate.fleet.base',
           'paddle.fluid.incubate.fleet.parameter_server',
           'paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler',