Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
a7c52100
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
a7c52100
编写于
8月 10, 2020
作者:
G
gongweibao
提交者:
GitHub
8月 10, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix test_hdfs bug. (#26068)
* fix merge3 test=develop
上级
50f149a4
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
218 addition
and
90 deletion
+218
-90
python/paddle/fleet/utils/fs.py
python/paddle/fleet/utils/fs.py
+211
-78
python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
+1
-1
python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py
python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py
+2
-3
python/paddle/fluid/incubate/fleet/collective/__init__.py
python/paddle/fluid/incubate/fleet/collective/__init__.py
+2
-4
python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
+1
-2
python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
+1
-2
未找到文件。
python/paddle/fleet/utils/fs.py
浏览文件 @
a7c52100
...
...
@@ -26,6 +26,7 @@ import logging
import
six
import
abc
import
paddle.fluid
as
fluid
from
paddle.fluid
import
core
import
functools
from
pathlib
import
PurePosixPath
,
Path
...
...
@@ -33,7 +34,7 @@ import shutil
__all__
=
[
'FS'
,
'LocalFS'
,
'HDFSClient'
,
'ExecuteError'
,
'FSTimeOut'
,
'FSFileExistsError'
,
'FSFileNotExistsError'
'FSFileExistsError'
,
'FSFileNotExistsError'
,
'FSShellCmdAborted'
]
...
...
@@ -53,6 +54,10 @@ class FSTimeOut(Exception):
pass
class
FSShellCmdAborted
(
ExecuteError
):
pass
class
FS
(
object
):
@
abc
.
abstractmethod
def
ls_dir
(
self
,
fs_path
):
...
...
@@ -95,7 +100,7 @@ class FS(object):
raise
NotImplementedError
@
abc
.
abstractmethod
def
mv
(
self
,
fs_src_path
,
fs_dst_path
):
def
mv
(
self
,
fs_src_path
,
fs_dst_path
,
overwrite
=
False
,
test_exists
=
False
):
raise
NotImplementedError
@
abc
.
abstractmethod
...
...
@@ -103,15 +108,11 @@ class FS(object):
raise
NotImplementedError
@
abc
.
abstractmethod
def
glob
(
self
,
fs_path
):
raise
NotImplementedError
@
abc
.
abstractmethod
def
stat
(
self
,
fs_path
):
def
list_dirs
(
self
,
fs_path
):
raise
NotImplementedError
@
abc
.
abstractmethod
def
walk
(
self
,
fs_path
):
def
touch
(
self
,
fs_path
,
exist_ok
=
True
):
raise
NotImplementedError
...
...
@@ -135,14 +136,8 @@ class LocalFS(FS):
fs_path
)
os
.
system
(
"mkdir -p {}"
.
format
(
fs_path
))
def
is_file
(
self
,
fs_path
):
return
os
.
path
.
isfile
(
fs_path
)
def
is_dir
(
self
,
fs_path
):
return
os
.
path
.
isdir
(
fs_path
)
def
is_exist
(
self
,
fs_path
):
return
os
.
path
.
exists
(
fs_path
)
def
rename
(
self
,
fs_src_path
,
fs_dst_path
):
os
.
rename
(
fs_src_path
,
fs_dst_path
)
def
_rmr
(
self
,
fs_path
):
shutil
.
rmtree
(
fs_path
)
...
...
@@ -159,24 +154,51 @@ class LocalFS(FS):
return
self
.
_rmr
(
fs_path
)
def
rename
(
self
,
fs_src_path
,
fs_dst_path
):
os
.
rename
(
fs_src_path
,
fs_dst_path
)
def
need_upload_download
(
self
):
return
False
def
touch
(
self
,
fs_path
):
return
Path
(
fs_path
).
touch
()
def
is_file
(
self
,
fs_path
):
return
os
.
path
.
isfile
(
fs_path
)
def
is_dir
(
self
,
fs_path
):
return
os
.
path
.
isdir
(
fs_path
)
def
is_exist
(
self
,
fs_path
):
return
os
.
path
.
exists
(
fs_path
)
def
touch
(
self
,
fs_path
,
exist_ok
=
True
):
if
self
.
is_exist
(
fs_path
):
if
exist_ok
:
return
raise
FSFileExistsError
return
Path
(
fs_path
).
touch
(
exist_ok
=
True
)
def
mv
(
self
,
src_path
,
dst_path
):
def
mv
(
self
,
src_path
,
dst_path
,
overwrite
=
False
,
test_exists
=
False
):
if
not
self
.
is_exist
(
src_path
):
raise
FSFileNotExistsError
if
overwrite
and
self
.
is_exist
(
dst_path
):
self
.
delete
(
dst_path
)
if
self
.
is_exist
(
dst_path
):
raise
FSFileExistsError
return
self
.
rename
(
src_path
,
dst_path
)
def
list_dirs
(
self
,
fs_path
):
"""
list directory under fs_path, and only give the pure name, not include the fs_path
"""
if
not
self
.
is_exist
(
fs_path
):
return
[]
dirs
=
[
f
for
f
in
os
.
listdir
(
fs_path
)
if
os
.
path
.
isdir
(
fs_path
+
"/"
+
f
)
]
return
dirs
"""HDFS Utils."""
...
...
@@ -198,6 +220,41 @@ def _handle_errors(f):
return
functools
.
wraps
(
f
)(
handler
)
def
_handle_errors
(
max_time_out
=
None
):
def
decorator
(
f
):
@
functools
.
wraps
(
f
)
def
handler
(
*
args
,
**
kwargs
):
o
=
args
[
0
]
time_out
=
max_time_out
if
time_out
is
None
:
time_out
=
float
(
o
.
_time_out
)
/
1000.0
else
:
time_out
/=
1000.0
inter
=
float
(
o
.
_sleep_inter
)
/
1000.0
start
=
time
.
time
()
last_print_time
=
start
while
True
:
try
:
return
f
(
*
args
,
**
kwargs
)
#important: only ExecuteError need to retry
except
ExecuteError
as
e
:
if
time
.
time
()
-
start
>=
time_out
:
raise
FSTimeOut
(
"args:{} timeout:{}"
.
format
(
args
,
time
.
time
()
-
start
))
time
.
sleep
(
inter
)
if
time
.
time
()
-
last_print_time
>
30
:
print
(
"hadoop operator timeout:args:{} timeout:{}"
.
format
(
args
,
time
.
time
()
-
start
))
last_print_time
=
time
.
time
()
return
handler
return
decorator
class
HDFSClient
(
FS
):
def
__init__
(
self
,
...
...
@@ -216,7 +273,8 @@ class HDFSClient(FS):
if
configs
:
for
k
,
v
in
six
.
iteritems
(
configs
):
self
.
pre_commands
.
append
(
'-D%s=%s'
%
(
k
,
v
))
config_command
=
'-D%s=%s'
%
(
k
,
v
)
self
.
pre_commands
.
append
(
config_command
)
self
.
_time_out
=
time_out
self
.
_sleep_inter
=
sleep_inter
...
...
@@ -225,10 +283,22 @@ class HDFSClient(FS):
r
'\s?responseErrorMsg\s?\:.*, errorCode\:\s?[0-9]+, path\:'
)
def
_run_cmd
(
self
,
cmd
,
redirect_stderr
=
False
):
ret
,
output
=
fluid
.
core
.
shell_execute_cmd
(
cmd
,
0
,
0
,
redirect_stderr
)
return
int
(
ret
),
output
.
splitlines
()
exe_cmd
=
"{} -{}"
.
format
(
self
.
_base_cmd
,
cmd
)
ret
,
output
=
core
.
shell_execute_cmd
(
exe_cmd
,
0
,
0
,
redirect_stderr
)
ret
=
int
(
ret
)
if
ret
==
134
:
raise
FSShellCmdAborted
(
cmd
)
return
ret
,
output
.
splitlines
()
@
_handle_errors
()
def
list_dirs
(
self
,
fs_path
):
if
not
self
.
is_exist
(
fs_path
):
return
[]
@
_handle_errors
dirs
,
files
=
self
.
_ls_dir
(
fs_path
)
return
dirs
@
_handle_errors
()
def
ls_dir
(
self
,
fs_path
):
"""
list directory under fs_path, and only give the pure name, not include the fs_path
...
...
@@ -236,11 +306,14 @@ class HDFSClient(FS):
if
not
self
.
is_exist
(
fs_path
):
return
[],
[]
cmd
=
"{} -ls {}"
.
format
(
self
.
_base_cmd
,
fs_path
)
return
self
.
_ls_dir
(
fs_path
)
def
_ls_dir
(
self
,
fs_path
):
cmd
=
"ls {}"
.
format
(
fs_path
)
ret
,
lines
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
raise
ExecuteError
(
cmd
)
dirs
=
[]
files
=
[]
...
...
@@ -249,9 +322,6 @@ class HDFSClient(FS):
if
len
(
arr
)
!=
8
:
continue
if
fs_path
not
in
arr
[
7
]:
continue
p
=
PurePosixPath
(
arr
[
7
])
if
arr
[
0
][
0
]
==
'd'
:
dirs
.
append
(
p
.
name
)
...
...
@@ -268,18 +338,20 @@ class HDFSClient(FS):
return
None
@
_handle_errors
@
_handle_errors
()
def
is_dir
(
self
,
fs_path
):
if
not
self
.
is_exist
(
fs_path
):
return
False
cmd
=
"{} -test -d {}"
.
format
(
self
.
_base_cmd
,
fs_path
,
redirect_stderr
=
True
)
return
self
.
_is_dir
(
fs_path
)
def
_is_dir
(
self
,
fs_path
):
cmd
=
"test -d {}"
.
format
(
fs_path
,
redirect_stderr
=
True
)
ret
,
lines
=
self
.
_run_cmd
(
cmd
)
if
ret
:
# other error
if
self
.
_test_match
(
lines
)
!=
None
:
raise
ExecuteError
if
self
.
_test_match
(
lines
):
raise
ExecuteError
(
cmd
)
return
False
...
...
@@ -289,94 +361,155 @@ class HDFSClient(FS):
if
not
self
.
is_exist
(
fs_path
):
return
False
return
not
self
.
is_dir
(
fs_path
)
return
not
self
.
_
is_dir
(
fs_path
)
@
_handle_errors
@
_handle_errors
()
def
is_exist
(
self
,
fs_path
):
cmd
=
"
{} -ls {} "
.
format
(
self
.
_base_cmd
,
fs_path
)
cmd
=
"
ls {} "
.
format
(
fs_path
)
ret
,
out
=
self
.
_run_cmd
(
cmd
,
redirect_stderr
=
True
)
if
ret
!=
0
:
for
l
in
out
:
if
"No such file or directory"
in
l
:
return
False
raise
ExecuteError
raise
ExecuteError
(
cmd
)
return
True
@
_handle_errors
# can't retry
def
upload
(
self
,
local_path
,
fs_path
):
if
self
.
is_exist
(
fs_path
):
raise
FSFileExistsError
raise
FSFileExistsError
(
"{} exists"
.
format
(
fs_path
))
local
=
LocalFS
()
if
not
local
.
is_exist
(
local_path
):
raise
FSFileNotExistsError
cmd
=
"{} -put {} {}"
.
format
(
self
.
_base_cmd
,
local_path
,
fs_path
)
ret
,
lines
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
@
_handle_errors
raise
FSFileNotExistsError
(
"{} not exists"
.
format
(
local_path
))
return
self
.
_try_upload
(
local_path
,
fs_path
)
@
_handle_errors
()
def
_try_upload
(
self
,
local_path
,
fs_path
):
cmd
=
"put {} {}"
.
format
(
local_path
,
fs_path
)
ret
=
0
try
:
ret
,
lines
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
(
cmd
)
except
Exception
as
e
:
self
.
delete
(
fs_path
)
raise
e
# can't retry
def
download
(
self
,
fs_path
,
local_path
):
if
self
.
is_exist
(
local_path
):
raise
FSFileExistsError
raise
FSFileExistsError
(
"{} exists"
.
format
(
local_path
))
if
not
self
.
is_exist
(
fs_path
):
raise
FSFileNotExistsError
cmd
=
"{} -get {} {}"
.
format
(
self
.
_base_cmd
,
fs_path
,
local_path
)
ret
,
lines
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
@
_handle_errors
raise
FSFileNotExistsError
(
"{} not exits"
.
format
(
fs_path
))
return
self
.
_try_download
(
fs_path
,
local_path
)
@
_handle_errors
()
def
_try_download
(
self
,
fs_path
,
local_path
):
cmd
=
"get {} {}"
.
format
(
fs_path
,
local_path
)
ret
=
0
try
:
ret
,
lines
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
(
cmd
)
except
Exception
as
e
:
local_fs
=
LocalFS
()
local_fs
.
delete
(
local_path
)
raise
e
@
_handle_errors
()
def
mkdirs
(
self
,
fs_path
):
if
self
.
is_exist
(
fs_path
):
return
cmd
=
"{} -mkdir {}"
.
format
(
self
.
_base_cmd
,
fs_path
)
ret
,
lines
=
self
.
_run_cmd
(
cmd
)
out_hdfs
=
False
cmd
=
"mkdir {} "
.
format
(
fs_path
)
ret
,
out
=
self
.
_run_cmd
(
cmd
,
redirect_stderr
=
True
)
if
ret
!=
0
:
raise
ExecuteError
for
l
in
out
:
if
"No such file or directory"
in
l
:
out_hdfs
=
True
break
if
not
out_hdfs
:
raise
ExecuteError
(
cmd
)
if
out_hdfs
and
not
self
.
is_exist
(
fs_path
):
cmd
=
"mkdir -p {}"
.
format
(
fs_path
)
ret
,
lines
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
(
cmd
)
def
mv
(
self
,
fs_src_path
,
fs_dst_path
,
overwrite
=
False
,
test_exists
=
True
):
if
overwrite
and
self
.
is_exist
(
fs_dst_path
):
self
.
delete
(
fs_dst_path
)
@
_handle_errors
def
mv
(
self
,
fs_src_path
,
fs_dst_path
,
test_exists
=
True
):
if
test_exists
:
if
not
self
.
is_exist
(
fs_src_path
):
raise
FSFileNotExistsError
raise
FSFileNotExistsError
(
"{} is not exists"
.
format
(
fs_src_path
))
if
self
.
is_exist
(
fs_dst_path
):
raise
FSFileExistsError
raise
FSFileExistsError
(
"{} exists already"
.
format
(
fs_src_path
,
fs_dst_path
,
fs_dst_path
))
return
self
.
_try_mv
(
fs_src_path
,
fs_dst_path
)
@
_handle_errors
()
def
_try_mv
(
self
,
fs_src_path
,
fs_dst_path
):
cmd
=
"mv {} {}"
.
format
(
fs_src_path
,
fs_dst_path
)
ret
=
0
try
:
ret
,
_
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
(
cmd
)
except
Exception
as
e
:
if
not
self
.
is_exist
(
fs_src_path
)
and
\
self
.
is_exist
(
fs_dst_path
):
return
raise
e
cmd
=
"{} -mv {} {}"
.
format
(
self
.
_base_cmd
,
fs_src_path
,
fs_dst_path
)
ret
,
_
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
@
_handle_errors
def
_rmr
(
self
,
fs_path
):
cmd
=
"
{} -rmr {}"
.
format
(
self
.
_base_cmd
,
fs_path
)
cmd
=
"
rmr {}"
.
format
(
fs_path
)
ret
,
_
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
raise
ExecuteError
(
cmd
)
@
_handle_errors
def
_rm
(
self
,
fs_path
):
cmd
=
"
{} -rm {}"
.
format
(
self
.
_base_cmd
,
fs_path
)
cmd
=
"
rm {}"
.
format
(
fs_path
)
ret
,
_
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
raise
ExecuteError
(
cmd
)
@
_handle_errors
()
def
delete
(
self
,
fs_path
):
if
not
self
.
is_exist
(
fs_path
):
return
is_dir
=
self
.
is_dir
(
fs_path
)
is_dir
=
self
.
_
is_dir
(
fs_path
)
if
is_dir
:
return
self
.
_rmr
(
fs_path
)
return
self
.
_rm
(
fs_path
)
def
touch
(
self
,
fs_path
,
exist_ok
=
True
):
if
self
.
is_exist
(
fs_path
):
if
exist_ok
:
return
raise
FSFileExistsError
return
self
.
_touchz
(
fs_path
)
@
_handle_errors
()
def
_touchz
(
self
,
fs_path
):
cmd
=
"touchz {}"
.
format
(
fs_path
)
ret
,
_
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
def
need_upload_download
(
self
):
return
True
python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
浏览文件 @
a7c52100
...
...
@@ -24,7 +24,6 @@ from threading import Thread, current_thread
from
contextlib
import
contextmanager
from
paddle.fluid
import
unique_name
,
compiler
from
paddle.fluid.incubate.fleet.utils.hdfs
import
HDFSClient
from
.checkpoint_saver
import
SerializableBase
,
CheckpointSaver
,
PaddleModel
from
paddle.fluid.framework
import
in_dygraph_mode
,
Program
...
...
@@ -306,6 +305,7 @@ class TrainEpochRange(SerializableBase):
if
self
.
_checker
.
ce_test
:
config
=
None
from
paddle.fleet.utils.fs
import
HDFSClient
self
.
_hdfs
=
HDFSClient
(
self
.
_checker
.
hdfs_home
,
config
)
self
.
_cper
=
CheckpointSaver
(
self
.
_hdfs
)
...
...
python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py
浏览文件 @
a7c52100
...
...
@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from
..fleet.utils.fs
import
FS
,
LocalFS
from
..fleet.utils.hdfs
import
HDFSClient
from
...compiler
import
CompiledProgram
...
...
@@ -81,6 +79,7 @@ class CheckpointSaver(object):
tmp_path
=
"{}.tmp"
.
format
(
real_path
)
saved_path
=
tmp_path
from
paddle.fleet.utils.fs
import
LocalFS
local_fs
=
LocalFS
()
cache_path
=
None
...
...
@@ -121,7 +120,6 @@ class CheckpointSaver(object):
Deserialize objects in slists from path
Return really load path
"""
if
checkpoint_no
is
None
:
max_no
=
self
.
_get_last_checkpoint_no
(
path
)
...
...
@@ -136,6 +134,7 @@ class CheckpointSaver(object):
assert
isinstance
(
checkpoint_no
,
int
)
assert
checkpoint_no
>=
0
from
paddle.fleet.utils.fs
import
LocalFS
local_fs
=
LocalFS
()
if
self
.
_fs
.
need_upload_download
():
cache_path
=
"{}/{}.{}.load_cache"
.
format
(
...
...
python/paddle/fluid/incubate/fleet/collective/__init__.py
浏览文件 @
a7c52100
...
...
@@ -26,7 +26,6 @@ from paddle.fluid.incubate.fleet.base.fleet_base import Mode
from
paddle.fluid.incubate.fleet.base.fleet_base
import
DistributedOptimizer
from
paddle.fluid
import
compiler
from
paddle.fluid.incubate.fleet.utils.fs
import
LocalFS
from
paddle.fluid.incubate.checkpoint.checkpoint_saver
import
PaddleModel
,
CheckpointSaver
import
os
...
...
@@ -143,14 +142,13 @@ class Collective(Fleet):
path
,
trainer_id
,
train_status
,
fs
,
main_program
=
None
,
fs
=
LocalFS
(),
local_cache_path
=
".cache"
,
remain_all_checkpoint
=
True
):
"""
This function save persistables and current epoch num to path.
"""
if
main_program
==
None
:
main_program
=
self
.
_transpiled_program
...
...
@@ -173,8 +171,8 @@ class Collective(Fleet):
path
,
trainer_id
,
train_status
,
fs
,
main_program
=
None
,
fs
=
LocalFS
(),
local_cache_path
=
".cache"
,
ignore_empty
=
True
):
"""
...
...
python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
浏览文件 @
a7c52100
...
...
@@ -20,8 +20,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
import
os
import
sys
from
paddle.fluid.incubate.fleet.utils.fs
import
LocalFS
from
paddle.fluid.incubate.fleet.utils.hdfs
import
HDFSClient
from
paddle.fleet.utils.fs
import
LocalFS
,
HDFSClient
import
paddle.fluid.incubate.checkpoint.auto_checkpoint
as
acp
from
paddle.fluid.incubate.checkpoint.checkpoint_saver
import
PaddleModel
from
paddle.fluid.framework
import
program_guard
...
...
python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
浏览文件 @
a7c52100
...
...
@@ -20,8 +20,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
import
os
import
sys
from
paddle.fluid.incubate.fleet.utils.fs
import
LocalFS
from
paddle.fluid.incubate.fleet.utils.hdfs
import
HDFSClient
from
paddle.fleet.utils.fs
import
LocalFS
,
HDFSClient
import
paddle.fluid.incubate.checkpoint.auto_checkpoint
as
acp
from
paddle.fluid.incubate.checkpoint.checkpoint_saver
import
PaddleModel
from
paddle.fluid.framework
import
program_guard
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录