Commit 24a063f6 (unverified)
Authored Apr 03, 2020 by gongweibao · Committed by GitHub on Apr 03, 2020
Add fleet checkpoint on local fs and remote fs(such as hdfs) for EDL (#22586)
Parent: 0c23e3ff

Showing 13 changed files with 1,130 additions and 214 deletions (+1130 / -214):
paddle/fluid/framework/io/CMakeLists.txt (+1 / -1)
paddle/fluid/framework/io/shell.cc (+33 / -6)
paddle/fluid/framework/io/shell.h (+6 / -1)
paddle/fluid/pybind/pybind.cc (+4 / -2)
python/paddle/distributed/cloud_utils.py (+79 / -0)
python/paddle/distributed/fs_wrapper.py (+223 / -0)
python/paddle/distributed/launch.py (+74 / -200)
python/paddle/distributed/utils.py (+424 / -0)
python/paddle/fluid/incubate/fleet/collective/__init__.py (+202 / -2)
python/paddle/fluid/tests/unittests/CMakeLists.txt (+2 / -0)
python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py (+77 / -0)
python/paddle/fluid/tests/unittests/test_launch.sh (+4 / -2)
python/requirements.txt (+1 / -0)
paddle/fluid/framework/io/CMakeLists.txt

 cc_library(fs SRCS fs.cc DEPS string_helper glog boost)
-cc_library(shell SRCS shell.cc DEPS string_helper glog)
+cc_library(shell SRCS shell.cc DEPS string_helper glog timer)
 cc_test(test_fs SRCS test_fs.cc DEPS fs shell)
paddle/fluid/framework/io/shell.cc

@@ -13,6 +13,8 @@
 // limitations under the License.

 #include "paddle/fluid/framework/io/shell.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/timer.h"

 namespace paddle {
 namespace framework {
@@ -296,23 +298,48 @@ std::pair<std::shared_ptr<FILE>, std::shared_ptr<FILE>> shell_p2open(
 #endif
 }

-std::string shell_get_command_output(const std::string& cmd) {
+std::string shell_get_command_output(const std::string& cmd, int time_out,
+                                     int sleep_inter, bool print_cmd) {
 #if defined _WIN32 || defined __APPLE__
-  return "";
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "This function(shell_get_command_output) is not implemented under _WIN32 "
+      "or __APPLE__."));
 #else
   int err_no = 0;
+  platform::Timer timer;
   do {
+    if (print_cmd) {
+      LOG(INFO) << "exec cmd:[" << cmd << "]";
+    }
     err_no = 0;
     std::shared_ptr<FILE> pipe = shell_popen(cmd, "r", &err_no);
     string::LineFileReader reader;
-
-    if (reader.getdelim(&*pipe, 0)) {
-      pipe = nullptr;
-      if (err_no == 0) {
-        return reader.get();
-      }
+    char* buf = reader.getdelim(&*pipe, 0);
+    if (err_no == 0) {
+      if (buf) {
+        return reader.get();
+      }
+      return "";
+    }
+
+    if (sleep_inter > 0) {
+      usleep(sleep_inter);
+    }
+
+    timer.Pause();
+    if (time_out > 0 && timer.ElapsedMS() >= time_out) {
+      PADDLE_THROW(paddle::platform::errors::ExecutionTimeout(
+          "shell_get_command_output execute error errno:%d and try until "
+          "timeout.",
+          errno));
+      return "";
     }
-  } while (err_no == -1);
+    timer.Resume();
+
+    pipe = nullptr;
+  } while (err_no);
+
   return "";
 #endif
 }
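For readers skimming the C++: the new loop retries the command as long as it keeps failing, optionally sleeping between attempts, and gives up once the accumulated time passes the deadline. A rough Python equivalent of that control flow (illustrative only, not part of the commit; run_once stands in for the shell_popen/LineFileReader plumbing):

    import time

    def get_command_output(run_once, time_out_ms=-1, sleep_inter_ms=-1):
        # Retry while the command keeps failing; -1 disables the deadline.
        start = time.monotonic()
        while True:
            ok, output = run_once()
            if ok:
                return output
            if sleep_inter_ms > 0:
                time.sleep(sleep_inter_ms / 1000.0)
            if time_out_ms > 0 and (time.monotonic() - start) * 1000.0 >= time_out_ms:
                raise TimeoutError("command kept failing until timeout")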
paddle/fluid/framework/io/shell.h

@@ -65,7 +65,12 @@ inline void shell_execute(const std::string& cmd) {
   } while (err_no == -1);
 }

-extern std::string shell_get_command_output(const std::string& cmd);
+// timeout:ms, default -1 means forever.
+// sleep_inter:ms, default -1 means not sleep.
+extern std::string shell_get_command_output(const std::string& cmd,
+                                            int time_out = -1,
+                                            int sleep_inter = -1,
+                                            bool print_cmd = false);

 }  // namespace framework
 }  // namespace paddle
paddle/fluid/pybind/pybind.cc

@@ -1494,8 +1494,10 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
   m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
   m.def("is_compiled_with_dist", IsCompiledWithDIST);
-  m.def("run_cmd", [](const std::string &cmd) -> const std::string {
-    return paddle::framework::shell_get_command_output(cmd);
+  m.def("run_cmd", [](const std::string &cmd, int time_out = -1,
+                      int sleep_inter = -1) -> const std::string {
+    return paddle::framework::shell_get_command_output(cmd, time_out,
+                                                       sleep_inter);
   });
 #ifdef PADDLE_WITH_CUDA
   m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
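With the binding updated, Python callers can pass the timeout and retry interval through fluid.core.run_cmd, which is exactly how the new fs_wrapper module below drives hadoop commands. A minimal sketch (all three arguments passed explicitly, since the C++ lambda's default arguments are not exposed as Python defaults):

    import paddle.fluid as fluid

    # Retry "echo hello" for at most 10000 ms, sleeping 1000 between attempts
    # (shell.h documents both values in milliseconds; -1 disables them).
    out = fluid.core.run_cmd("echo hello", 10000, 1000)
    print(out)  # the command's captured stdout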
python/paddle/distributed/cloud_utils.py (new file, mode 100644)

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from utils import get_cluster, logger
import os


def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_gpus):
    """
    args_node_ips, args_node_ip:string
    """
    #you can automatically get ip info while using paddlecloud multi nodes mode.
    node_ips = os.getenv("PADDLE_TRAINERS")
    assert node_ips is not None, "PADDLE_TRAINERS should not be None"

    node_ip = os.getenv("POD_IP")
    assert node_ip is not None, "POD_IP should not be None"

    node_rank = os.getenv("PADDLE_TRAINER_ID")
    assert node_rank is not None, "PADDLE_TRAINER_ID should not be None"

    node_ips = node_ips.split(",")
    num_nodes = len(node_ips)
    node_rank = int(node_rank)

    if node_ip != "127.0.0.1" and node_ip != args_node_ip:
        logger.warning("Please NOTE: When using paddlecloud, node_ip is \
automatically got from POD_IP. Your input node_ip: {} doesn't equals to \
node_ip: {} from paddlecloud environment.".format(args_node_ip, node_ip))

    if args_node_ips != "127.0.0.1" and args_node_ips != ",".join(node_ips):
        logger.warning(
            "Please NOTE: When using paddlecloud, cluster_node_ips is \
automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node). \
Your input cluster_node_ips: {} doesn't equals to IPs: {} from \
paddlecloud environment.".format(args_node_ips, node_ips))

    started_port = args_port
    print("num_nodes:", num_nodes)
    if num_nodes > 1:
        try:
            paddle_port = int(os.getenv("PADDLE_PORT", ""))
            paddle_port_num = int(os.getenv("TRAINER_PORTS_NUM", ""))

            if paddle_port_num >= len(selected_gpus) and paddle_port != args_port:
                logger.warning("Use Cloud specified port:{}.".format(
                    paddle_port))
                started_port = paddle_port

        except Exception as e:
            print(e)
            pass

    if started_port is None:
        started_port = 6170

    logger.debug("parsed from args:node_ips:{} \
node_ip:{} node_rank:{} started_port:{}".format(node_ips, node_ip, node_rank,
                                                started_port))

    ports = [x for x in range(started_port, started_port + len(selected_gpus))]
    cluster, pod = get_cluster(node_ips, node_ip, ports, selected_gpus)
    return cluster, cluster.pods[node_rank]


def get_trainers_num():
    return int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
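get_cloud_cluster trusts the paddlecloud environment over command-line input. A sketch of that environment contract (all values below are made up for illustration, and the call assumes the module is importable as written):

    import os

    os.environ["PADDLE_TRAINERS"] = "10.0.0.1,10.0.0.2"  # one IP per node
    os.environ["POD_IP"] = "10.0.0.1"                    # this node's IP
    os.environ["PADDLE_TRAINER_ID"] = "0"                # this node's rank

    cluster, pod = get_cloud_cluster(
        args_node_ips="10.0.0.1,10.0.0.2",
        args_node_ip="10.0.0.1",
        args_port=6170,
        selected_gpus=["0", "1"])
    # pod is cluster.pods[0]: this node's entry, one trainer per selected GPU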
python/paddle/distributed/fs_wrapper.py (new file, mode 100644)

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle.fluid as fluid
import sys
import abc
import os
from pathlib import PurePosixPath
import shutil


class FS(object):
    @abc.abstractmethod
    def list_dirs(self, fs_path):
        pass

    @abc.abstractmethod
    def ls_dir(self, fs_path):
        pass

    @abc.abstractmethod
    def stat(self, fs_path):
        pass

    @abc.abstractmethod
    def upload(self, local_path, fs_path):
        pass

    @abc.abstractmethod
    def download(self, fs_path, local_path):
        pass

    @abc.abstractmethod
    def mkdir(self, fs_path):
        pass

    @abc.abstractmethod
    def mv(self, fs_src_path, fs_dst_path):
        pass

    @abc.abstractmethod
    def rmr(self, fs_path):
        pass

    @abc.abstractmethod
    def rm(self, fs_path):
        pass

    @abc.abstractmethod
    def delete(self, fs_path):
        pass

    @abc.abstractmethod
    def need_upload_download(self):
        pass


class LocalFS(FS):
    def list_dirs(self, fs_path):
        if not self.stat(fs_path):
            return []

        return [
            f for f in os.listdir(fs_path)
            if os.path.isdir(fs_path + "/" + f)
        ]

    def ls_dir(self, fs_path):
        return [f for f in os.listdir(fs_path)]

    def stat(self, fs_path):
        return os.path.exists(fs_path)

    def mkdir(self, fs_path):
        assert not os.path.isfile(fs_path), "{} is already a file".format(
            fs_path)
        os.system("mkdir -p {}".format(fs_path))

    def mv(self, fs_src_path, fs_dst_path):
        os.rename(fs_src_path, fs_dst_path)

    def rmr(self, fs_path):
        shutil.rmtree(fs_path)

    def rm(self, fs_path):
        os.remove(fs_path)

    def delete(self, fs_path):
        if not self.stat(fs_path):
            return

        if os.path.isfile(fs_path):
            return self.rm(fs_path)

        return self.rmr(fs_path)

    def need_upload_download(self):
        return False


class BDFS(FS):
    def __init__(self,
                 hdfs_name,
                 hdfs_ugi,
                 time_out=20 * 60 * 1000,
                 sleep_inter=1000):
        self._base_cmd = "hadoop fs -Dfs.default.name=\"{}\" -Dhadoop.job.ugi=\"{}\"".format(
            hdfs_name, hdfs_ugi)
        self._time_out = time_out
        self._sleep_inter = sleep_inter

    def _run_cmd(self, cmd):
        ret = fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
        if len(ret) <= 0:
            return []

        lines = ret.splitlines()
        return lines

    def list_dirs(self, fs_path):
        if not self.stat(fs_path):
            return []

        dirs, _ = self.ls_dir(fs_path)
        return dirs

    def ls_dir(self, fs_path):
        """
        list directory under fs_path, and only give the pure name, not include the fs_path
        """
        cmd = "{} -ls {}".format(self._base_cmd, fs_path)
        lines = self._run_cmd(cmd)

        dirs = []
        files = []
        for line in lines:
            arr = line.split()
            if len(arr) != 8:
                continue

            if fs_path not in arr[7]:
                continue

            p = PurePosixPath(arr[7])
            if arr[0][0] == 'd':
                dirs.append(p.name)
            else:
                files.append(p.name)

        return dirs, files

    def is_dir(self, fs_path):
        cmd = "{} -test -d {} ; echo $?".format(self._base_cmd, fs_path)

        test = self._run_cmd(cmd)
        if test[0].strip() == "0":
            return True

        return False

    def stat(self, fs_path):
        cmd = "{} -test -e {} ; echo $?".format(self._base_cmd, fs_path)

        test = self._run_cmd(cmd)
        if test[0].strip() == "0":
            return True

        return False

    def upload(self, local_path, fs_path):
        cmd = "{} -put {} {}".format(self._base_cmd, local_path, fs_path)
        fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)

    def download(self, fs_path, local_path):
        cmd = "{} -get {} {}/".format(self._base_cmd, fs_path, local_path)
        fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)

    def mkdir(self, fs_path):
        if not self.stat(fs_path):
            cmd = "{} -mkdir {}".format(self._base_cmd, fs_path)
            fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)

    def mv(self, fs_src_path, fs_dst_path):
        cmd = "{} -mv {} {}".format(self._base_cmd, fs_src_path, fs_dst_path)
        fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)

    def rmr(self, fs_path):
        if not self.stat(fs_path):
            return

        cmd = "{} -rmr {}".format(self._base_cmd, fs_path)
        return fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)

    def rm(self, fs_path):
        if not self.stat(fs_path):
            return

        cmd = "{} -rm {}".format(self._base_cmd, fs_path)
        return fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)

    def delete(self, fs_path):
        if not self.stat(fs_path):
            return

        is_dir = self.is_dir(fs_path)
        if is_dir:
            return self.rmr(fs_path)

        return self.rm(fs_path)

    def need_upload_download(self):
        return True
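Because LocalFS and BDFS implement the same FS interface, the checkpoint code later in this commit can treat a local directory and an HDFS path uniformly; need_upload_download() tells the caller whether a local staging copy is required first. A minimal LocalFS sketch (the directory name is illustrative):

    from paddle.distributed.fs_wrapper import LocalFS

    fs = LocalFS()
    fs.mkdir("./ckpt_demo")               # mkdir -p semantics
    assert fs.stat("./ckpt_demo")         # stat() is an existence test
    print(fs.list_dirs("."))              # subdirectory names only
    fs.delete("./ckpt_demo")              # rm for files, rmr for directories
    assert not fs.need_upload_download()  # local paths need no staging copy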
python/paddle/distributed/launch.py

@@ -36,7 +36,6 @@ launch a process on each of the given gpu card.
 """
 from __future__ import print_function
-import logging
 import sys
 from sys import version
 import subprocess
@@ -45,17 +44,11 @@ import time
 import six
 import copy
 from argparse import ArgumentParser, REMAINDER
-import paddle
 import paddle.fluid as fluid
-from contextlib import closing
-import socket

-logger = logging.getLogger()
-logger.setLevel(logging.INFO)
-log_handler = logging.StreamHandler()
-log_format = logging.Formatter(
-    '%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
-log_handler.setFormatter(log_format)
-logger.addHandler(log_handler)
+from paddle.distributed.utils import *
+import paddle.distributed.cloud_utils as cloud_utils

 def _print_arguments(args):
@@ -65,32 +58,6 @@ def _print_arguments(args):
     print("------------------------------------------------")

-def find_free_ports(num):
-    def __free_port():
-        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
-            s.bind(('', 0))
-            return s.getsockname()[1]
-
-    port_set = set()
-    step = 0
-    while True:
-        port = __free_port()
-        if port not in port_set:
-            port_set.add(port)
-
-        if len(port_set) >= num:
-            return port_set
-
-        step += 1
-        if step > 100:
-            print("can't find avilable port and use the specified static port now!")
-            return None
-
-    return None

 def _parse_args():
     """
     Helper function parsing the command line options
@@ -146,6 +113,12 @@ POD_IP (current node ip address, not needed for local training)
         "each process is bound to a single GPU. And if it's not set, this module will use all the gpu cards for training."
     )
+    parser.add_argument(
+        "--log_level",
+        type=int,
+        default=20,  # logging.INFO, details are here:https://docs.python.org/3/library/logging.html#levels
+        help="Logging level, default is logging.INFO")
     parser.add_argument(
         "--log_dir",
         type=str,
@@ -166,196 +139,97 @@ POD_IP (current node ip address, not needed for local training)
     return parser.parse_args()

-def terminate_procs(procs):
-    for p in procs:
-        if p.poll() is None:
-            p.terminate()
-
-def start_procs(args):
-    """
-    """
-    default_env = os.environ.copy()
-
-    current_node_ip = args.node_ip
-    node_ips = [x.strip() for x in args.cluster_node_ips.split(',')]
-    node_id = node_ips.index(current_node_ip)
-
-    if args.use_paddlecloud:
-        trainer_nums = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
-        if trainer_nums != 1:
-            #you can automatically get ip info while using paddlecloud multi nodes mode.
-            current_node_ip = os.getenv("POD_IP")
-            assert current_node_ip is not None, "POD_IP should not be None"
-            node_ips = os.getenv("PADDLE_TRAINERS")
-            assert node_ips is not None, "PADDLE_TRAINERS should not be None"
-            node_ips = node_ips.split(",")
-            node_id = os.getenv("PADDLE_TRAINER_ID")
-            assert node_id is not None, "PADDLE_TRAINER_ID should not be None"
-            node_id = int(node_id)
-
-            if args.node_ip != "127.0.0.1" and current_node_ip != args.node_ip:
-                logger.warning("Please NOTE: When using paddlecloud, current_node_ip is \
-automatically got from POD_IP. Your input node_ip: {} doesn't equals to \
-current_node_ip: {} from paddlecloud environment.".format(args.node_ip, current_node_ip))
-            if args.cluster_node_ips != "127.0.0.1" and \
-                    args.cluster_node_ips != ",".join(node_ips):
-                logger.warning("Please NOTE: When using paddlecloud, cluster_node_ips is \
-automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node). \
-Your input cluster_node_ips: {} doesn't equals to IPs: {} from \
-paddlecloud environment.".format(args.cluster_node_ips, node_ips))
-
-    num_nodes = len(node_ips)
-
-    if args.selected_gpus is None:
-        gpus_num = fluid.core.get_cuda_device_count()
-        selected_gpus = [str(x) for x in range(0, gpus_num)]
-    else:
-        cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
-        if cuda_visible_devices is None or cuda_visible_devices == "":
-            selected_gpus = [x.strip() for x in args.selected_gpus.split(',')]
-        else:
-            # change selected_gpus into relative values
-            # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
-            # therefore selected_gpus=0,1,2,3
-            cuda_visible_devices_list = cuda_visible_devices.split(',')
-            for x in args.selected_gpus.split(','):
-                assert x in cuda_visible_devices_list, "Can't find "\
-                    "your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]." \
-                    % (x, cuda_visible_devices)
-            selected_gpus = [
-                cuda_visible_devices_list.index(x.strip())
-                for x in args.selected_gpus.split(',')
-            ]
-    selected_gpus_num = len(selected_gpus)
-
-    if args.use_paddlecloud and num_nodes > 1:
-        cloud_paddle_port = os.getenv("PADDLE_PORT", "")
-        cloud_paddle_port_num = os.getenv("PADDLE_PORTS_NUM", "")
-        if cloud_paddle_port != "" and cloud_paddle_port_num != "":
-            cloud_paddle_port_num = int(cloud_paddle_port_num)
-            if cloud_paddle_port_num >= selected_gpus_num:
-                args.started_port = int(cloud_paddle_port)
-                logger.warning("Use Cloud specified port:{}.".format(
-                    cloud_paddle_port))
-
-    free_ports = None
-    if not args.use_paddlecloud and num_nodes <= 1 and args.started_port is None:
-        free_ports = find_free_ports(selected_gpus_num)
-        if free_ports is not None:
-            free_ports = list(free_ports)
-            args.started_port = free_ports[0]
-
-    if args.started_port is None:
-        args.started_port = 6170
-
-    if free_ports is None:
-        free_ports = [
-            x for x in range(args.started_port,
-                             args.started_port + selected_gpus_num)
-        ]
-
-    trainers_endpoints = ""
-    for ip in node_ips:
-        for i in range(0, selected_gpus_num):
-            if trainers_endpoints != "":
-                trainers_endpoints += ","
-            trainers_endpoints += "%s:%d" % (ip, free_ports[i])
-
-    nranks = num_nodes * selected_gpus_num
-
-    if args.print_config:
-        print("trainers_endpoints:", trainers_endpoints, ", node_id:", node_id,
-              ", current_node_ip:", current_node_ip, ", num_nodes:", num_nodes,
-              ", node_ips:", node_ips, ", nranks:", nranks)
-
-    current_env = copy.copy(default_env)
-    #paddle broadcast ncclUniqueId use socket, and
-    #proxy maybe make trainers unreachable, so delete them.
-    #if we set them to "", grpc will log error message "bad uri"
-    #so just delete them.
-    current_env.pop("http_proxy", None)
-    current_env.pop("https_proxy", None)
-
-    procs = []
-    log_fns = []
-    cmds = []
-    ranks = []
-    for i in range(0, selected_gpus_num):
-        rank = (node_id * selected_gpus_num + i)
-        current_env.update({
-            "FLAGS_selected_gpus": "%s" % selected_gpus[i],
-            "PADDLE_TRAINER_ID": "%d" % rank,
-            "PADDLE_CURRENT_ENDPOINT": "%s:%d" % (current_node_ip, free_ports[i]),
-            "PADDLE_TRAINERS_NUM": "%d" % nranks,
-            "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints
-        })
-
-        cmd = [sys.executable, "-u", args.training_script] + args.training_script_args
-
-        cmds.append(cmd)
-
-        if args.log_dir is not None:
-            os.system("mkdir -p {}".format(args.log_dir))
-            fn = open("%s/workerlog.%d" % (args.log_dir, i), "w")
-            log_fns.append(fn)
-            proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
-        else:
-            proc = subprocess.Popen(cmd, env=current_env)
-
-        procs.append(proc)
-        ranks.append(rank)
-
-    try:
-        alive = True
-        error = False
-        error_rank = []
-        # wait all process finish or one error
-        while alive and not error:
-            alive = False
-            for rank, p in zip(ranks, procs):
-                ret = p.poll()
-                if ret is None:
-                    alive = True
-                elif ret != 0:
-                    error = True
-                    error_rank.append(rank)
-            time.sleep(1)
-
-        if error:
-            terminate_procs(procs)
-            exit(1)
-    except KeyboardInterrupt:
-        logger.warning("KeyboardInterrupt, exit")
-        terminate_procs(procs)
-        raise
-    except SystemExit:
-        logger.error(
-            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
-            format(nranks, error_rank))
-        terminate_procs(procs)
-        raise
-    except:
-        logger.error(
-            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
-            format(nranks, error_rank))
-        terminate_procs(procs)
-        raise
-    finally:
-        for fn in log_fns:
-            fn.close()
+def get_cluster_from_args(args, selected_gpus):
+    node_ips = [x.strip() for x in args.cluster_node_ips.split(',')]
+    node_ip = args.node_ip
+    node_rank = node_ips.index(node_ip)
+
+    logger.debug("parsed from args:node_ips:{} node_ip:{} node_rank:{}".format(
+        node_ips, node_ip, node_rank))
+
+    free_ports = None
+    if not args.use_paddlecloud and len(
+            node_ips) <= 1 and args.started_port is None:
+        free_ports = find_free_ports(len(selected_gpus))
+        if free_ports is not None:
+            free_ports = list(free_ports)
+    else:
+        free_ports = [
+            x for x in range(args.started_port,
+                             args.started_port + len(selected_gpus))
+        ]
+
+    return get_cluster(node_ips, node_ip, free_ports, selected_gpus)
+
+def get_gpus(selected_gpus):
+    if selected_gpus is None:
+        gpus_num = fluid.core.get_cuda_device_count()
+        selected_gpus = [str(x) for x in range(0, gpus_num)]
+    else:
+        cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
+        if cuda_visible_devices is None or cuda_visible_devices == "":
+            selected_gpus = [x.strip() for x in selected_gpus.split(',')]
+        else:
+            # change selected_gpus into relative values
+            # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7;
+            # therefore selected_gpus=0,1,2,3
+            cuda_visible_devices_list = cuda_visible_devices.split(',')
+            for x in selected_gpus.split(','):
+                assert x in cuda_visible_devices_list, "Can't find "\
+                    "your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]." \
+                    % (x, cuda_visible_devices)
+            selected_gpus = [
+                cuda_visible_devices_list.index(x.strip())
+                for x in selected_gpus.split(',')
+            ]
+
+    return selected_gpus
+
+def launch(args):
+    # parse arguments, used for cloud-single-machine and local
+    selected_gpus = get_gpus(args.selected_gpus)
+    trainers_num = cloud_utils.get_trainers_num()
+    logger.debug("parsed from args trainerss_num:{} selected_gpus:{}".format(
+        trainers_num, selected_gpus))
+
+    cluster = None
+    pod = None
+
+    if args.use_paddlecloud and trainers_num != 1:
+        cluster, pod = cloud_utils.get_cloud_cluster(
+            args.cluster_node_ips, args.node_ip, args.started_port,
+            selected_gpus)
+        logger.info("get cluster from cloud:{}".format(cluster))
+    else:
+        cluster, pod = get_cluster_from_args(args, selected_gpus)
+        logger.info("get cluster from args:{}".format(cluster))
+
+    procs = start_local_trainers(
+        cluster,
+        pod,
+        training_script=args.training_script,
+        training_script_args=args.training_script_args,
+        log_dir=args.log_dir)
+
+    while True:
+        alive = watch_local_trainers(procs, cluster.trainers_nranks())
+
+        if not alive:
+            logger.info("Local procs complete, POD info:{}".format(pod))
+            break
+
+        time.sleep(3)

-def launch():
-    args = _parse_args()
-    if args.print_config:
-        _print_arguments(args)
-    start_procs(args)
-
 if __name__ == "__main__":
-    launch()
+    args = _parse_args()
+
+    logger = get_logger(args.log_level)
+
+    if args.print_config:
+        _print_arguments(args)
+
+    launch(args)
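One subtlety in the new get_gpus: when CUDA_VISIBLE_DEVICES is set, user-supplied GPU ids are remapped to positions within that list, since CUDA renumbers the visible devices from zero. Illustration (made-up values, assuming get_gpus is imported from this module on a machine with four visible GPUs):

    import os

    os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"
    print(get_gpus("4,5"))  # -> [0, 1], positions inside CUDA_VISIBLE_DEVICES
    print(get_gpus(None))   # -> every visible device: ['0', '1', '2', '3']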
python/paddle/distributed/utils.py (new file, mode 100644)

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
import logging
import socket
import time
import os
import signal
import copy
import sys
import subprocess
from contextlib import closing
import socket

logger = logging.getLogger("root")
logger.propagate = False


class Hdfs(object):
    def __init__(self):
        self.hdfs_ugi = None
        self.hdfs_name = None
        self.hdfs_path = None

    def is_valid(self):
        return self.hdfs_ugi is not None and \
            self.hdfs_name is not None and \
            self.hdfs_path is not None

    def __str__(self):
        return "hdfs_ugi:{} hdfs_name:{} hdfs_path{}".format(
            self.hdfs_ugi, self.hdfs_name, self.hdfs_path)

    def __eq__(self, n):
        return self.hdfs_ugi == n.hdfs_ugi and \
            self.hdfs_name == n.hdfs_name and \
            self.hdfs_path == n.hdfs_path

    def __ne__(self, n):
        return not self == n


class Cluster(object):
    def __init__(self, hdfs):
        self.job_server = None
        self.pods = []
        self.hdfs = None
        self.job_stage_flag = None

    def __str__(self):
        return "job_server:{} pods:{} job_stage_flag:{} hdfs:{}".format(
            self.job_server, [str(pod) for pod in self.pods],
            self.job_stage_flag, self.hdfs)

    def __eq__(self, cluster):
        if len(self.pods) != len(cluster.pods):
            return False

        for a, b in zip(self.pods, cluster.pods):
            if a != b:
                return False

        if self.job_stage_flag != cluster.job_stage_flag:
            return False

        return True

    def __ne__(self, cluster):
        return not self.__eq__(cluster)

    def update_pods(cluster):
        self.pods = copy.copy(cluster.pods)

    def trainers_nranks(self):
        return len(self.trainers_endpoints())

    def pods_nranks(self):
        return len(self.pods)

    def trainers_endpoints(self):
        r = []
        for pod in self.pods:
            for t in pod.trainers:
                r.append(t.endpoint)
        return r

    def pods_endpoints(self):
        r = []
        for pod in self.pods:
            ep = "{}:{}".format(pod.addr, pod.port)
            assert pod.port != None and pod.addr != None, "{} not a valid endpoint".format(
                ep)
            r.append(ep)

        return r

    def get_pod_by_id(self, pod_id):
        for pod in self.pods:
            if str(pod_id) == str(pod.id):
                return pod

        return None


class JobServer(object):
    def __init__(self):
        self.endpoint = None

    def __str__(self):
        return "{}".format(self.endpoint)

    def __eq__(self, j):
        return self.endpint == j.endpoint

    def __ne__(self, j):
        return not self == j


class Trainer(object):
    def __init__(self):
        self.gpus = []
        self.endpoint = None
        self.rank = None

    def __str__(self):
        return "gpu:{} endpoint:{} rank:{}".format(self.gpus, self.endpoint,
                                                   self.rank)

    def __eq__(self, t):
        if len(self.gpus) != len(t.gpus):
            return False

        if self.endpoint != t.endpoint or \
                self.rank != t.rank:
            return False

        for a, b in zip(self.gpus, t.gpus):
            if a != b:
                return False

        return True

    def __ne__(self, t):
        return not self == t

    def rank(self):
        return self.rank


class Pod(object):
    def __init__(self):
        self.rank = None
        self.id = None
        self.addr = None
        self.port = None
        self.trainers = []
        self.gpus = []

    def __str__(self):
        return "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{}".format(
            self.rank, self.id, self.addr, self.port, self.gpus,
            [str(t) for t in self.trainers])

    def __eq__(self, pod):
        if self.rank != pod.rank or \
                self.id != pod.id or \
                self.addr != pod.addr or \
                self.port != pod.port:
            logger.debug("pod {} != pod".format(self, pod))
            return False

        if len(self.trainers) != len(pod.trainers):
            logger.debug("trainers {} != {}".format(self.trainers,
                                                    pod.trainers))
            return False

        for i in range(len(self.trainers)):
            if self.trainers[i] != pod.trainers[i]:
                logger.debug("trainer {} != {}".format(self.trainers[i],
                                                       pod.trainers[i]))
                return False

        return True

    def __ne__(self, pod):
        return not self == pod

    def parse_response(self, res_pods):
        pass

    def rank(self):
        return self.rank

    def get_visible_gpus(self):
        r = ""
        for g in self.gpus:
            r += "{},".format(g)

        assert r != "", "this pod {} can't see any gpus".format(self)

        r = r[:-1]
        return r


def get_logger(log_level, name="root"):
    logger = logging.getLogger(name)
    logger.setLevel(log_level)

    log_handler = logging.StreamHandler()
    log_format = logging.Formatter(
        '%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')
    log_handler.setFormatter(log_format)
    logger.addHandler(log_handler)

    return logger


def get_cluster(node_ips, node_ip, paddle_ports, selected_gpus):
    assert type(paddle_ports) is list, "paddle_ports must be list"
    cluster = Cluster(hdfs=None)
    trainer_rank = 0
    for node_rank, ip in enumerate(node_ips):
        pod = Pod()
        pod.rank = node_rank
        pod.addr = ip
        for i in range(len(selected_gpus)):
            trainer = Trainer()
            trainer.gpus.append(selected_gpus[i])
            trainer.endpoint = "%s:%d" % (ip, paddle_ports[i])
            trainer.rank = trainer_rank
            trainer_rank += 1

            pod.trainers.append(trainer)
        cluster.pods.append(pod)

    pod_rank = node_ips.index(node_ip)
    return cluster, cluster.pods[pod_rank]


def terminate_local_procs(procs):
    for p in procs:
        if p.proc.poll() is None:
            p.proc.terminate()
            p.log_fn.close()
            logger.debug("terminate process id:{}".format(p.proc.pid))

    # wait all process terminiated
    # time.sleep(3)
    for step in range(0, 50):
        alive = False
        for p in procs:
            if p.proc.poll() is None:  # not termniate
                os.kill(p.proc.pid, signal.SIGKILL)
                alive = True

        if not alive:
            logger.info("terminate all the procs")
            return

        time.sleep(3)

    logger.fatal("can't kill all process and exit")
    exit(1)


def get_host_name_ip():
    try:
        host_name = socket.gethostname()
        host_ip = socket.gethostbyname(host_name)
        return host_name, host_ip
    except:
        return None


def add_arguments(argname, type, default, help, argparser, **kwargs):
    """Add argparse's argument.
    Usage:
    .. code-block:: python
        parser = argparse.ArgumentParser()
        add_argument("name", str, "Jonh", "User name.", parser)
        args = parser.parse_args()
    """
    type = distutils.util.strtobool if type == bool else type
    argparser.add_argument(
        "--" + argname,
        default=default,
        type=type,
        help=help + ' Default: %(default)s.',
        **kwargs)


def find_free_ports(num):
    def __free_port():
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
            s.bind(('', 0))
            return s.getsockname()[1]

    port_set = set()
    step = 0
    while True:
        port = __free_port()
        if port not in port_set:
            port_set.add(port)

        if len(port_set) >= num:
            return port_set

        step += 1
        if step > 100:
            print(
                "can't find avilable port and use the specified static port now!"
            )
            return None

    return None


class TrainerProc(object):
    def __init__(self):
        self.proc = None
        self.log_fn = None
        self.rank = None
        self.cmd = None


def start_local_trainers(cluster,
                         pod,
                         training_script,
                         training_script_args,
                         log_dir=None):
    current_env = copy.copy(os.environ.copy())
    #paddle broadcast ncclUniqueId use socket, and
    #proxy maybe make trainers unreachable, so delete them.
    #if we set them to "", grpc will log error message "bad uri"
    #so just delete them.
    current_env.pop("http_proxy", None)
    current_env.pop("https_proxy", None)

    procs = []
    for idx, t in enumerate(pod.trainers):
        proc_env = {
            "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in t.gpus]),
            "PADDLE_TRAINER_ID": "%d" % t.rank,
            "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint,
            "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints())
        }

        current_env.update(proc_env)

        logger.debug("trainer proc env:{}".format(current_env))

        cmd = [sys.executable, "-u", training_script] + training_script_args

        logger.info("start trainer proc:{} env:{}".format(cmd, proc_env))

        fn = None
        if log_dir is not None:
            os.system("mkdir -p {}".format(log_dir))
            fn = open("%s/workerlog.%d" % (log_dir, idx), "a")
            proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn)
        else:
            proc = subprocess.Popen(cmd, env=current_env)

        tp = TrainerProc()
        tp.proc = proc
        tp.rank = t.rank
        tp.log_fn = fn
        tp.cmd = cmd

        procs.append(tp)

    return procs


def watch_local_trainers(procs, nranks):
    try:
        error = False
        error_rank = []
        # wait all process finish or one error
        alive = False
        for p in procs:
            ret = p.proc.poll()
            if ret is None:
                alive = True
            elif ret != 0:
                error = True
                error_rank.append(p.rank)

        if error:
            terminate_local_procs(procs)
            exit(1)

    except KeyboardInterrupt:
        logger.warning("KeyboardInterrupt, exit")
        terminate_local_procs(procs)
        raise
    except SystemExit:
        logger.error(
            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
            format(nranks, error_rank))
        terminate_local_procs(procs)
        raise
    except:
        logger.error(
            "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.".
            format(nranks, error_rank))
        terminate_local_procs(procs)
        raise

    return alive
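get_cluster builds the whole job topology up front: one Pod per node, one Trainer per selected GPU, and trainer ranks assigned globally in node order. A small single-node sketch (ports are illustrative):

    cluster, pod = get_cluster(
        node_ips=["127.0.0.1"],
        node_ip="127.0.0.1",
        paddle_ports=[6170, 6171],
        selected_gpus=["0", "1"])

    print(cluster.pods_nranks())         # 1 -- one pod per node
    print(cluster.trainers_nranks())     # 2 -- one trainer per GPU per node
    print(cluster.trainers_endpoints())  # ['127.0.0.1:6170', '127.0.0.1:6171']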
python/paddle/fluid/incubate/fleet/collective/__init__.py

@@ -26,10 +26,14 @@ from paddle.fluid.incubate.fleet.base.fleet_base import Mode
 from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
 from paddle.fluid import compiler
+from paddle.distributed.fs_wrapper import LocalFS, BDFS

 import os
 import sys
 import six
+import json
+import re
+import shutil

 class LambConfig(object):
@@ -42,6 +46,21 @@ class DistFCConfig(object):
     pass

+class TrainStatus(object):
+    def __init__(self, epoch_no=-1):
+        # completed epoch
+        self._epoch_no = epoch_no
+
+    def next(self):
+        return self._epoch_no + 1
+
+    def __eq__(self, t):
+        return self._epoch_no == t._epoch_no
+
+    def __ne__(self, t):
+        return not self == t
+
 class Collective(Fleet):
     def __init__(self):
         super(Collective, self).__init__(Mode.COLLECTIVE)
@@ -51,6 +70,8 @@ class Collective(Fleet):
         self._origin_program = None
         self._transpiled_program = None
         self.main_program = None
+        self._checkoint_prefix = "__paddle_fleet_checkpoint__"
+        self._param_file_name = "_paddle_fleet_param__"

     def init_worker(self):
         logging.warn(
@@ -103,7 +124,11 @@ class Collective(Fleet):
                                 executor, main_program, None, None,
                                 export_for_deployment)

-    def save_persistables(self, executor, dirname, main_program=None):
+    def save_persistables(self,
+                          executor,
+                          dirname,
+                          main_program=None,
+                          filename=None):
         """
         This function filters out all variables with `persistable==True` from
         the give `main_program` and then saves these variables to the folder
@@ -125,7 +150,182 @@ class Collective(Fleet):
             "In fleet.save_inference_model() function, main_program " \
             "must be as Program type."

-        io.save_persistables(executor, dirname, main_program, None)
+        io.save_persistables(executor, dirname, main_program,
+                             filename=filename)
+
+    def _save_train_status(self, path, train_status):
+        d = {}
+        d["epoch_no"] = train_status._epoch_no
+
+        file_name = "{}/fleet_train_status".format(path)
+        with open(file_name, 'w') as f:
+            json.dump(d, f)
+
+    def _load_train_status(self, path):
+        file_name = "{}/fleet_train_status".format(path)
+
+        r = TrainStatus()
+        if not os.path.isfile(file_name):
+            return r
+
+        d = {}
+        with open(file_name, 'r') as f:
+            d = json.load(f)
+
+        assert "epoch_no" in d, "Can't find epoch_no in dict from train_status file:{}".format(
+            d)
+        r._epoch_no = d["epoch_no"]
+        assert r._epoch_no >= 0, "Data in checkpoint file is not valid:{}".format(
+            d)
+
+        return r
+
+    def _get_last_checkpoint_no(self, root_path, fs):
+        """
+        only get the first depth
+        """
+        max_no = -1
+        d = {}
+        dirs = fs.list_dirs(root_path)
+        for dir in dirs:
+            g = dir.split(".")
+            if len(g) != 2:
+                continue
+
+            if g[0] != "__paddle_fleet_checkpoint__":
+                continue
+
+            try:
+                n = int(g[1])
+                if n > max_no:
+                    max_no = n
+            except:
+                continue
+
+        return max_no
+
+    def clean_redundant_check_points(self,
+                                     root_path,
+                                     fs=LocalFS(),
+                                     checkpoint_num=1):
+        max_no = self._get_last_checkpoint_no(root_path, fs)
+        if max_no < 0:
+            return
+
+        if checkpoint_num < 1:
+            checkpoint_num = 1
+
+        dirs = fs.list_dirs(root_path)
+        for dir in dirs:
+            g = dir.split(".")
+            if len(g) != 2:
+                continue
+
+            if g[0] != self._checkoint_prefix:
+                continue
+
+            try:
+                n = int(g[1])
+                if n <= max_no - checkpoint_num:
+                    path = "{}/{}.{}".format(root_path,
+                                             self._checkoint_prefix, n)
+                    fs.rmr(path)
+            except Exception as e:
+                print(e)
+                continue
+
+    def save_check_point(self,
+                         executor,
+                         path,
+                         train_status,
+                         main_program=None,
+                         fs=LocalFS(),
+                         local_cache_path=".cache",
+                         remain_all_checkpoint=True):
+        """
+        This function save persistables and current epoch num to path.
+        """
+        if main_program == None:
+            main_program = self._transpiled_program
+
+        if not fs.stat(path):
+            fs.mkdir(path)
+
+        max_no = self._get_last_checkpoint_no(path, fs=fs)
+        if max_no < 0:
+            max_no = -1
+
+        real_path = "{}/{}.{}".format(path, self._checkoint_prefix, max_no + 1)
+        tmp_path = "{}.tmp".format(real_path)
+        saved_path = tmp_path
+
+        local_fs = LocalFS()
+
+        cache_path = None
+        if fs.need_upload_download():
+            cache_path = "{}/{}.{}.saved_cache".format(
+                local_cache_path, self._checkoint_prefix, max_no + 1)
+            if not local_fs.stat(cache_path):
+                local_fs.mkdir(cache_path)
+            saved_path = cache_path
+
+        self.save_persistables(
+            executor=executor,
+            dirname=saved_path,
+            main_program=main_program,
+            filename=self._param_file_name)
+        self._save_train_status(path=saved_path, train_status=train_status)
+
+        if fs.need_upload_download():
+            fs.delete(tmp_path)
+            fs.upload(cache_path, tmp_path)
+        fs.mv(tmp_path, real_path)
+
+        if not remain_all_checkpoint:
+            self.clean_redundant_check_points(path)
+
+    def load_check_point(self,
+                         executor,
+                         path,
+                         trainer_id,
+                         main_program=None,
+                         fs=LocalFS(),
+                         local_cache_path=".cache",
+                         ignore_empty=True):
+        """
+        This function load persistables and current epoch num from path.
+        """
+        max_no = self._get_last_checkpoint_no(path, fs)
+
+        if not ignore_empty:
+            assert max_no >= 0, "Can't find checkpoint"
+
+        if max_no < 0:
+            return None
+
+        local_fs = LocalFS()
+        if fs.need_upload_download():
+            cache_path = "{}/{}.{}.load_cache.{}".format(
+                local_cache_path, self._checkoint_prefix, max_no, trainer_id)
+            if local_fs.stat(cache_path):
+                local_fs.delete(cache_path)
+
+        real_path = "{}/{}.{}".format(path, self._checkoint_prefix, max_no)
+        load_path = real_path
+        if fs.need_upload_download():
+            fs.download(real_path, cache_path)
+            load_path = cache_path
+
+        if main_program == None:
+            main_program = self._transpiled_program
+
+        io.load_persistables(
+            executor=executor,
+            dirname=load_path,
+            main_program=main_program,
+            filename=self._param_file_name)
+
+        return self._load_train_status(load_path)

 fleet = Collective()
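Two details above are worth spelling out. First, a checkpoint is published atomically: it is written to "<real_path>.tmp" (staged through a local cache when the fs needs upload/download) and only fs.mv()ed to its final name once complete, so a partially written checkpoint is never visible under a final name. Second, checkpoints are numbered monotonically as "__paddle_fleet_checkpoint__.<n>"; the numbering logic is small enough to sketch standalone (illustrative only, mirroring _get_last_checkpoint_no):

    prefix = "__paddle_fleet_checkpoint__"

    def last_checkpoint_no(dir_names):
        # largest <n> among names of the form "<prefix>.<n>"; -1 if none
        max_no = -1
        for name in dir_names:
            parts = name.split(".")
            if len(parts) == 2 and parts[0] == prefix and parts[1].isdigit():
                max_no = max(max_no, int(parts[1]))
        return max_no

    print(last_checkpoint_no([prefix + ".0", prefix + ".3", "junk"]))  # -> 3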
python/paddle/fluid/tests/unittests/CMakeLists.txt

@@ -28,6 +28,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_communicator_geo)
 list(APPEND MIXED_DIST_TEST_OPS test_communicator_half_async)
 list(APPEND MIXED_DIST_TEST_OPS test_communicator_sync)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_checkpoint)
 foreach(TEST_OP ${MIXED_DIST_TEST_OPS})
     list(REMOVE_ITEM TEST_OPS ${TEST_OP})
 endforeach()
@@ -301,6 +302,7 @@ if(WITH_DISTRIBUTE)
     if(WITH_GPU)
         # NOTE. test_launch only work in gpu collective mode
         bash_test_modules(test_launch MODULES test_launch.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
+        py_test_modules(test_fleet_checkpoint MODULES test_fleet_checkpoint)
     endif()
     bash_test_modules(test_launch_ps MODULES test_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py (new file, mode 100644)

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, TrainStatus
import os
from paddle.distributed.fs_wrapper import LocalFS, BDFS


class FleetTest(unittest.TestCase):
    def _test_check_point(self, fs, dir_path):
        file_name = "persistables"

        os.environ["TRAINING_ROLE"] = "TRAINER"
        os.environ["PADDLE_TRAINER_ID"] = "0"
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070"

        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

        image = fluid.data(name='img', shape=[None, 28, 28], dtype='float32')
        label = fluid.data(name='label', shape=[None, 1], dtype='int64')
        feeder = fluid.DataFeeder(
            feed_list=[image, label], place=fluid.CPUPlace())
        predict = fluid.layers.fc(input=image, size=10, act='softmax')
        loss = fluid.layers.cross_entropy(input=predict, label=label)
        avg_loss = fluid.layers.mean(loss)
        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001)

        dist_optimizer = fleet.distributed_optimizer(optimizer)
        dist_optimizer.minimize(avg_loss)

        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(fluid.default_startup_program())

        status = TrainStatus(2)
        fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
        n1 = fleet._get_last_checkpoint_no(dir_path, fs=fs)

        status2 = fleet.load_check_point(exe, dir_path, trainer_id=0, fs=fs)
        self.assertEqual(status2, status)

        fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
        n2 = fleet._get_last_checkpoint_no(dir_path, fs=fs)
        self.assertEqual(n2, n1 + 1)

        fleet.clean_redundant_check_points(dir_path, fs=fs)

    def test_hdfs_check_point(self):
        try:
            fs = BDFS("xxxx", "xxxx", 1 * 1000, 1 * 1000)
            dir_path = "/user/Paddle_Data/gongweibao/edl_test/my_paddle_model"
            self._test_check_point(fs, dir_path)
        except Exception as e:
            print(e)

    def test_local_check_point(self):
        fs = LocalFS()
        dir_path = "./my_paddle_model"
        self._test_check_point(fs, dir_path)


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_launch.sh

@@ -6,6 +6,7 @@ launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
 python ${launch_py} multi_process.py

 # use paddlecloud
+echo "begin test use paddlecloud"
 cluster_node_ips="10.0.0.1"
 node_ip="10.0.0.1"
 export PADDLE_TRAINERS_NUM=2
@@ -14,7 +15,7 @@ export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
 export PADDLE_TRAINER_ID=0

 export PADDLE_PORT=35019
-export PADDLE_PORTS_NUM=2
+export TRAINER_PORTS_NUM=2

 distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
 CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py
@@ -47,8 +48,9 @@ if [ -f $file_1 ]; then
     rm $file_1
 fi

 unset PADDLE_PORT
-unset PADDLE_PORTS_NUM
+unset TRAINER_PORTS_NUM

 echo ""
 echo "paddle.distributed.launch async poll process test"
python/requirements.txt

@@ -19,3 +19,4 @@ decorator
 prettytable
 objgraph
 astor
+pathlib