Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
a7c52100
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
a7c52100
编写于
8月 10, 2020
作者:
G
gongweibao
提交者:
GitHub
8月 10, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix test_hdfs bug. (#26068)
* fix merge3 test=develop
上级
50f149a4
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
218 addition
and
90 deletion
+218
-90
python/paddle/fleet/utils/fs.py
python/paddle/fleet/utils/fs.py
+211
-78
python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
+1
-1
python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py
python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py
+2
-3
python/paddle/fluid/incubate/fleet/collective/__init__.py
python/paddle/fluid/incubate/fleet/collective/__init__.py
+2
-4
python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
+1
-2
python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
+1
-2
未找到文件。
python/paddle/fleet/utils/fs.py
浏览文件 @
a7c52100
...
...
@@ -26,6 +26,7 @@ import logging
import
six
import
abc
import
paddle.fluid
as
fluid
from
paddle.fluid
import
core
import
functools
from
pathlib
import
PurePosixPath
,
Path
...
...
@@ -33,7 +34,7 @@ import shutil
__all__
=
[
'FS'
,
'LocalFS'
,
'HDFSClient'
,
'ExecuteError'
,
'FSTimeOut'
,
'FSFileExistsError'
,
'FSFileNotExistsError'
'FSFileExistsError'
,
'FSFileNotExistsError'
,
'FSShellCmdAborted'
]
...
...
@@ -53,6 +54,10 @@ class FSTimeOut(Exception):
pass
class
FSShellCmdAborted
(
ExecuteError
):
pass
class
FS
(
object
):
@
abc
.
abstractmethod
def
ls_dir
(
self
,
fs_path
):
...
...
@@ -95,7 +100,7 @@ class FS(object):
raise
NotImplementedError
@
abc
.
abstractmethod
def
mv
(
self
,
fs_src_path
,
fs_dst_path
):
def
mv
(
self
,
fs_src_path
,
fs_dst_path
,
overwrite
=
False
,
test_exists
=
False
):
raise
NotImplementedError
@
abc
.
abstractmethod
...
...
@@ -103,15 +108,11 @@ class FS(object):
raise
NotImplementedError
@
abc
.
abstractmethod
def
glob
(
self
,
fs_path
):
raise
NotImplementedError
@
abc
.
abstractmethod
def
stat
(
self
,
fs_path
):
def
list_dirs
(
self
,
fs_path
):
raise
NotImplementedError
@
abc
.
abstractmethod
def
walk
(
self
,
fs_path
):
def
touch
(
self
,
fs_path
,
exist_ok
=
True
):
raise
NotImplementedError
...
...
@@ -135,14 +136,8 @@ class LocalFS(FS):
fs_path
)
os
.
system
(
"mkdir -p {}"
.
format
(
fs_path
))
def
is_file
(
self
,
fs_path
):
return
os
.
path
.
isfile
(
fs_path
)
def
is_dir
(
self
,
fs_path
):
return
os
.
path
.
isdir
(
fs_path
)
def
is_exist
(
self
,
fs_path
):
return
os
.
path
.
exists
(
fs_path
)
def
rename
(
self
,
fs_src_path
,
fs_dst_path
):
os
.
rename
(
fs_src_path
,
fs_dst_path
)
def
_rmr
(
self
,
fs_path
):
shutil
.
rmtree
(
fs_path
)
...
...
@@ -159,24 +154,51 @@ class LocalFS(FS):
return
self
.
_rmr
(
fs_path
)
def
rename
(
self
,
fs_src_path
,
fs_dst_path
):
os
.
rename
(
fs_src_path
,
fs_dst_path
)
def
need_upload_download
(
self
):
return
False
def
touch
(
self
,
fs_path
):
return
Path
(
fs_path
).
touch
()
def
is_file
(
self
,
fs_path
):
return
os
.
path
.
isfile
(
fs_path
)
def
is_dir
(
self
,
fs_path
):
return
os
.
path
.
isdir
(
fs_path
)
def
is_exist
(
self
,
fs_path
):
return
os
.
path
.
exists
(
fs_path
)
def
touch
(
self
,
fs_path
,
exist_ok
=
True
):
if
self
.
is_exist
(
fs_path
):
if
exist_ok
:
return
raise
FSFileExistsError
return
Path
(
fs_path
).
touch
(
exist_ok
=
True
)
def
mv
(
self
,
src_path
,
dst_path
):
def
mv
(
self
,
src_path
,
dst_path
,
overwrite
=
False
,
test_exists
=
False
):
if
not
self
.
is_exist
(
src_path
):
raise
FSFileNotExistsError
if
overwrite
and
self
.
is_exist
(
dst_path
):
self
.
delete
(
dst_path
)
if
self
.
is_exist
(
dst_path
):
raise
FSFileExistsError
return
self
.
rename
(
src_path
,
dst_path
)
def
list_dirs
(
self
,
fs_path
):
"""
list directory under fs_path, and only give the pure name, not include the fs_path
"""
if
not
self
.
is_exist
(
fs_path
):
return
[]
dirs
=
[
f
for
f
in
os
.
listdir
(
fs_path
)
if
os
.
path
.
isdir
(
fs_path
+
"/"
+
f
)
]
return
dirs
"""HDFS Utils."""
...
...
@@ -198,6 +220,41 @@ def _handle_errors(f):
return
functools
.
wraps
(
f
)(
handler
)
def
_handle_errors
(
max_time_out
=
None
):
def
decorator
(
f
):
@
functools
.
wraps
(
f
)
def
handler
(
*
args
,
**
kwargs
):
o
=
args
[
0
]
time_out
=
max_time_out
if
time_out
is
None
:
time_out
=
float
(
o
.
_time_out
)
/
1000.0
else
:
time_out
/=
1000.0
inter
=
float
(
o
.
_sleep_inter
)
/
1000.0
start
=
time
.
time
()
last_print_time
=
start
while
True
:
try
:
return
f
(
*
args
,
**
kwargs
)
#important: only ExecuteError need to retry
except
ExecuteError
as
e
:
if
time
.
time
()
-
start
>=
time_out
:
raise
FSTimeOut
(
"args:{} timeout:{}"
.
format
(
args
,
time
.
time
()
-
start
))
time
.
sleep
(
inter
)
if
time
.
time
()
-
last_print_time
>
30
:
print
(
"hadoop operator timeout:args:{} timeout:{}"
.
format
(
args
,
time
.
time
()
-
start
))
last_print_time
=
time
.
time
()
return
handler
return
decorator
class
HDFSClient
(
FS
):
def
__init__
(
self
,
...
...
@@ -216,7 +273,8 @@ class HDFSClient(FS):
if
configs
:
for
k
,
v
in
six
.
iteritems
(
configs
):
self
.
pre_commands
.
append
(
'-D%s=%s'
%
(
k
,
v
))
config_command
=
'-D%s=%s'
%
(
k
,
v
)
self
.
pre_commands
.
append
(
config_command
)
self
.
_time_out
=
time_out
self
.
_sleep_inter
=
sleep_inter
...
...
@@ -225,10 +283,22 @@ class HDFSClient(FS):
r
'\s?responseErrorMsg\s?\:.*, errorCode\:\s?[0-9]+, path\:'
)
def
_run_cmd
(
self
,
cmd
,
redirect_stderr
=
False
):
ret
,
output
=
fluid
.
core
.
shell_execute_cmd
(
cmd
,
0
,
0
,
redirect_stderr
)
return
int
(
ret
),
output
.
splitlines
()
exe_cmd
=
"{} -{}"
.
format
(
self
.
_base_cmd
,
cmd
)
ret
,
output
=
core
.
shell_execute_cmd
(
exe_cmd
,
0
,
0
,
redirect_stderr
)
ret
=
int
(
ret
)
if
ret
==
134
:
raise
FSShellCmdAborted
(
cmd
)
return
ret
,
output
.
splitlines
()
@
_handle_errors
()
def
list_dirs
(
self
,
fs_path
):
if
not
self
.
is_exist
(
fs_path
):
return
[]
@
_handle_errors
dirs
,
files
=
self
.
_ls_dir
(
fs_path
)
return
dirs
@
_handle_errors
()
def
ls_dir
(
self
,
fs_path
):
"""
list directory under fs_path, and only give the pure name, not include the fs_path
...
...
@@ -236,11 +306,14 @@ class HDFSClient(FS):
if
not
self
.
is_exist
(
fs_path
):
return
[],
[]
cmd
=
"{} -ls {}"
.
format
(
self
.
_base_cmd
,
fs_path
)
return
self
.
_ls_dir
(
fs_path
)
def
_ls_dir
(
self
,
fs_path
):
cmd
=
"ls {}"
.
format
(
fs_path
)
ret
,
lines
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
raise
ExecuteError
(
cmd
)
dirs
=
[]
files
=
[]
...
...
@@ -249,9 +322,6 @@ class HDFSClient(FS):
if
len
(
arr
)
!=
8
:
continue
if
fs_path
not
in
arr
[
7
]:
continue
p
=
PurePosixPath
(
arr
[
7
])
if
arr
[
0
][
0
]
==
'd'
:
dirs
.
append
(
p
.
name
)
...
...
@@ -268,18 +338,20 @@ class HDFSClient(FS):
return
None
@
_handle_errors
@
_handle_errors
()
def
is_dir
(
self
,
fs_path
):
if
not
self
.
is_exist
(
fs_path
):
return
False
cmd
=
"{} -test -d {}"
.
format
(
self
.
_base_cmd
,
fs_path
,
redirect_stderr
=
True
)
return
self
.
_is_dir
(
fs_path
)
def
_is_dir
(
self
,
fs_path
):
cmd
=
"test -d {}"
.
format
(
fs_path
,
redirect_stderr
=
True
)
ret
,
lines
=
self
.
_run_cmd
(
cmd
)
if
ret
:
# other error
if
self
.
_test_match
(
lines
)
!=
None
:
raise
ExecuteError
if
self
.
_test_match
(
lines
):
raise
ExecuteError
(
cmd
)
return
False
...
...
@@ -289,94 +361,155 @@ class HDFSClient(FS):
if
not
self
.
is_exist
(
fs_path
):
return
False
return
not
self
.
is_dir
(
fs_path
)
return
not
self
.
_
is_dir
(
fs_path
)
@
_handle_errors
@
_handle_errors
()
def
is_exist
(
self
,
fs_path
):
cmd
=
"
{} -ls {} "
.
format
(
self
.
_base_cmd
,
fs_path
)
cmd
=
"
ls {} "
.
format
(
fs_path
)
ret
,
out
=
self
.
_run_cmd
(
cmd
,
redirect_stderr
=
True
)
if
ret
!=
0
:
for
l
in
out
:
if
"No such file or directory"
in
l
:
return
False
raise
ExecuteError
raise
ExecuteError
(
cmd
)
return
True
@
_handle_errors
# can't retry
def
upload
(
self
,
local_path
,
fs_path
):
if
self
.
is_exist
(
fs_path
):
raise
FSFileExistsError
raise
FSFileExistsError
(
"{} exists"
.
format
(
fs_path
))
local
=
LocalFS
()
if
not
local
.
is_exist
(
local_path
):
raise
FSFileNotExistsError
cmd
=
"{} -put {} {}"
.
format
(
self
.
_base_cmd
,
local_path
,
fs_path
)
ret
,
lines
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
@
_handle_errors
raise
FSFileNotExistsError
(
"{} not exists"
.
format
(
local_path
))
return
self
.
_try_upload
(
local_path
,
fs_path
)
@
_handle_errors
()
def
_try_upload
(
self
,
local_path
,
fs_path
):
cmd
=
"put {} {}"
.
format
(
local_path
,
fs_path
)
ret
=
0
try
:
ret
,
lines
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
(
cmd
)
except
Exception
as
e
:
self
.
delete
(
fs_path
)
raise
e
# can't retry
def
download
(
self
,
fs_path
,
local_path
):
if
self
.
is_exist
(
local_path
):
raise
FSFileExistsError
raise
FSFileExistsError
(
"{} exists"
.
format
(
local_path
))
if
not
self
.
is_exist
(
fs_path
):
raise
FSFileNotExistsError
cmd
=
"{} -get {} {}"
.
format
(
self
.
_base_cmd
,
fs_path
,
local_path
)
ret
,
lines
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
@
_handle_errors
raise
FSFileNotExistsError
(
"{} not exits"
.
format
(
fs_path
))
return
self
.
_try_download
(
fs_path
,
local_path
)
@
_handle_errors
()
def
_try_download
(
self
,
fs_path
,
local_path
):
cmd
=
"get {} {}"
.
format
(
fs_path
,
local_path
)
ret
=
0
try
:
ret
,
lines
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
(
cmd
)
except
Exception
as
e
:
local_fs
=
LocalFS
()
local_fs
.
delete
(
local_path
)
raise
e
@
_handle_errors
()
def
mkdirs
(
self
,
fs_path
):
if
self
.
is_exist
(
fs_path
):
return
cmd
=
"{} -mkdir {}"
.
format
(
self
.
_base_cmd
,
fs_path
)
ret
,
lines
=
self
.
_run_cmd
(
cmd
)
out_hdfs
=
False
cmd
=
"mkdir {} "
.
format
(
fs_path
)
ret
,
out
=
self
.
_run_cmd
(
cmd
,
redirect_stderr
=
True
)
if
ret
!=
0
:
raise
ExecuteError
for
l
in
out
:
if
"No such file or directory"
in
l
:
out_hdfs
=
True
break
if
not
out_hdfs
:
raise
ExecuteError
(
cmd
)
if
out_hdfs
and
not
self
.
is_exist
(
fs_path
):
cmd
=
"mkdir -p {}"
.
format
(
fs_path
)
ret
,
lines
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
(
cmd
)
def
mv
(
self
,
fs_src_path
,
fs_dst_path
,
overwrite
=
False
,
test_exists
=
True
):
if
overwrite
and
self
.
is_exist
(
fs_dst_path
):
self
.
delete
(
fs_dst_path
)
@
_handle_errors
def
mv
(
self
,
fs_src_path
,
fs_dst_path
,
test_exists
=
True
):
if
test_exists
:
if
not
self
.
is_exist
(
fs_src_path
):
raise
FSFileNotExistsError
raise
FSFileNotExistsError
(
"{} is not exists"
.
format
(
fs_src_path
))
if
self
.
is_exist
(
fs_dst_path
):
raise
FSFileExistsError
raise
FSFileExistsError
(
"{} exists already"
.
format
(
fs_src_path
,
fs_dst_path
,
fs_dst_path
))
return
self
.
_try_mv
(
fs_src_path
,
fs_dst_path
)
@
_handle_errors
()
def
_try_mv
(
self
,
fs_src_path
,
fs_dst_path
):
cmd
=
"mv {} {}"
.
format
(
fs_src_path
,
fs_dst_path
)
ret
=
0
try
:
ret
,
_
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
(
cmd
)
except
Exception
as
e
:
if
not
self
.
is_exist
(
fs_src_path
)
and
\
self
.
is_exist
(
fs_dst_path
):
return
raise
e
cmd
=
"{} -mv {} {}"
.
format
(
self
.
_base_cmd
,
fs_src_path
,
fs_dst_path
)
ret
,
_
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
@
_handle_errors
def
_rmr
(
self
,
fs_path
):
cmd
=
"
{} -rmr {}"
.
format
(
self
.
_base_cmd
,
fs_path
)
cmd
=
"
rmr {}"
.
format
(
fs_path
)
ret
,
_
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
raise
ExecuteError
(
cmd
)
@
_handle_errors
def
_rm
(
self
,
fs_path
):
cmd
=
"
{} -rm {}"
.
format
(
self
.
_base_cmd
,
fs_path
)
cmd
=
"
rm {}"
.
format
(
fs_path
)
ret
,
_
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
raise
ExecuteError
(
cmd
)
@
_handle_errors
()
def
delete
(
self
,
fs_path
):
if
not
self
.
is_exist
(
fs_path
):
return
is_dir
=
self
.
is_dir
(
fs_path
)
is_dir
=
self
.
_
is_dir
(
fs_path
)
if
is_dir
:
return
self
.
_rmr
(
fs_path
)
return
self
.
_rm
(
fs_path
)
def
touch
(
self
,
fs_path
,
exist_ok
=
True
):
if
self
.
is_exist
(
fs_path
):
if
exist_ok
:
return
raise
FSFileExistsError
return
self
.
_touchz
(
fs_path
)
@
_handle_errors
()
def
_touchz
(
self
,
fs_path
):
cmd
=
"touchz {}"
.
format
(
fs_path
)
ret
,
_
=
self
.
_run_cmd
(
cmd
)
if
ret
!=
0
:
raise
ExecuteError
def
need_upload_download
(
self
):
return
True
python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py
浏览文件 @
a7c52100
...
...
@@ -24,7 +24,6 @@ from threading import Thread, current_thread
from
contextlib
import
contextmanager
from
paddle.fluid
import
unique_name
,
compiler
from
paddle.fluid.incubate.fleet.utils.hdfs
import
HDFSClient
from
.checkpoint_saver
import
SerializableBase
,
CheckpointSaver
,
PaddleModel
from
paddle.fluid.framework
import
in_dygraph_mode
,
Program
...
...
@@ -306,6 +305,7 @@ class TrainEpochRange(SerializableBase):
if
self
.
_checker
.
ce_test
:
config
=
None
from
paddle.fleet.utils.fs
import
HDFSClient
self
.
_hdfs
=
HDFSClient
(
self
.
_checker
.
hdfs_home
,
config
)
self
.
_cper
=
CheckpointSaver
(
self
.
_hdfs
)
...
...
python/paddle/fluid/incubate/checkpoint/checkpoint_saver.py
浏览文件 @
a7c52100
...
...
@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from
..fleet.utils.fs
import
FS
,
LocalFS
from
..fleet.utils.hdfs
import
HDFSClient
from
...compiler
import
CompiledProgram
...
...
@@ -81,6 +79,7 @@ class CheckpointSaver(object):
tmp_path
=
"{}.tmp"
.
format
(
real_path
)
saved_path
=
tmp_path
from
paddle.fleet.utils.fs
import
LocalFS
local_fs
=
LocalFS
()
cache_path
=
None
...
...
@@ -121,7 +120,6 @@ class CheckpointSaver(object):
Deserialize objects in slists from path
Return really load path
"""
if
checkpoint_no
is
None
:
max_no
=
self
.
_get_last_checkpoint_no
(
path
)
...
...
@@ -136,6 +134,7 @@ class CheckpointSaver(object):
assert
isinstance
(
checkpoint_no
,
int
)
assert
checkpoint_no
>=
0
from
paddle.fleet.utils.fs
import
LocalFS
local_fs
=
LocalFS
()
if
self
.
_fs
.
need_upload_download
():
cache_path
=
"{}/{}.{}.load_cache"
.
format
(
...
...
python/paddle/fluid/incubate/fleet/collective/__init__.py
浏览文件 @
a7c52100
...
...
@@ -26,7 +26,6 @@ from paddle.fluid.incubate.fleet.base.fleet_base import Mode
from
paddle.fluid.incubate.fleet.base.fleet_base
import
DistributedOptimizer
from
paddle.fluid
import
compiler
from
paddle.fluid.incubate.fleet.utils.fs
import
LocalFS
from
paddle.fluid.incubate.checkpoint.checkpoint_saver
import
PaddleModel
,
CheckpointSaver
import
os
...
...
@@ -143,14 +142,13 @@ class Collective(Fleet):
path
,
trainer_id
,
train_status
,
fs
,
main_program
=
None
,
fs
=
LocalFS
(),
local_cache_path
=
".cache"
,
remain_all_checkpoint
=
True
):
"""
This function save persistables and current epoch num to path.
"""
if
main_program
==
None
:
main_program
=
self
.
_transpiled_program
...
...
@@ -173,8 +171,8 @@ class Collective(Fleet):
path
,
trainer_id
,
train_status
,
fs
,
main_program
=
None
,
fs
=
LocalFS
(),
local_cache_path
=
".cache"
,
ignore_empty
=
True
):
"""
...
...
python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
浏览文件 @
a7c52100
...
...
@@ -20,8 +20,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
import
os
import
sys
from
paddle.fluid.incubate.fleet.utils.fs
import
LocalFS
from
paddle.fluid.incubate.fleet.utils.hdfs
import
HDFSClient
from
paddle.fleet.utils.fs
import
LocalFS
,
HDFSClient
import
paddle.fluid.incubate.checkpoint.auto_checkpoint
as
acp
from
paddle.fluid.incubate.checkpoint.checkpoint_saver
import
PaddleModel
from
paddle.fluid.framework
import
program_guard
...
...
python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py
浏览文件 @
a7c52100
...
...
@@ -20,8 +20,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet
import
os
import
sys
from
paddle.fluid.incubate.fleet.utils.fs
import
LocalFS
from
paddle.fluid.incubate.fleet.utils.hdfs
import
HDFSClient
from
paddle.fleet.utils.fs
import
LocalFS
,
HDFSClient
import
paddle.fluid.incubate.checkpoint.auto_checkpoint
as
acp
from
paddle.fluid.incubate.checkpoint.checkpoint_saver
import
PaddleModel
from
paddle.fluid.framework
import
program_guard
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录