Commit 3a2a7116 (unverified)
Authored by danleifeng on Sep 03, 2020; committed via GitHub on Sep 03, 2020
Parent: e35ad3ee

【paddle.fleet】simplify fleetrun log infos (#26888)

* print detailed and clear log infos; test=develop
Showing 2 changed files with 87 additions and 16 deletions:

  python/paddle/distributed/fleet/launch.py        +35 -12
  python/paddle/distributed/fleet/launch_utils.py  +52 -4
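The thrust of the change is a logging-level split: keep a short, human-readable summary at INFO and push the full environment/POD dumps down to DEBUG, where they only appear when explicitly requested. A minimal standalone sketch of that pattern with Python's standard logging module (the names below are illustrative, not taken from the commit):

import logging

# Illustrative logger name; fleetrun's own logger is configured elsewhere.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("launcher-example")

def report(summary, details):
    # Concise one-liner for everyday runs.
    logger.info(summary)
    # Full payload only when the user opts into debug-level logging.
    logger.debug("details:{}".format(details))

report("Local processes completed.", {"pod": "example-pod-0"})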
python/paddle/distributed/fleet/launch.py
@@ -200,11 +200,11 @@ def launch_collective(args):
     start_port = os.environ.get('FLAGS_START_PORT')
     if cloud_utils.use_paddlecloud() and trainers_num != 1:
         cluster, pod = cloud_utils.get_cloud_cluster(args.ips, gpus, start_port)
-        logger.info("get cluster from cloud:{}".format(cluster))
+        logger.debug("get cluster from cloud:{}".format(cluster))
     else:
         # trainers_num = 1 or not use paddlecloud ips="a,b"
         cluster, pod = get_cluster_from_args(args, gpus)
-        logger.info("get cluster from args:{}".format(cluster))
+        logger.debug("get cluster from args:{}".format(cluster))

     procs = start_local_trainers(cluster,
@@ -217,7 +217,8 @@ def launch_collective(args):
         alive = watch_local_trainers(procs, cluster.trainers_nranks())

         if not alive:
-            logger.info("Local procs complete, POD info:{}".format(pod))
+            logger.info("Local processes completed.")
+            logger.debug("POD info:{}".format(pod))
             break

         time.sleep(3)
@@ -313,18 +314,26 @@ def launch_ps(args):
     cmds = []
     log_fns = []
     for idx, cur_server in enumerate(pod.servers):
-        current_env.update({
+        proc_env = {
             "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints,
             "PADDLE_PORT": cur_server.endpoint.split(":")[1],
             "TRAINING_ROLE": "PSERVER",
             "PADDLE_TRAINERS_NUM": str(worker_num),
             "POD_IP": cur_server.endpoint.split(":")[0]
-        })
+        }
+        current_env.update(proc_env)

         cmd = [sys.executable, "-u", args.training_script] + args.training_script_args
         cmds.append(cmd)

+        if idx == 0:
+            logger.info(
+                "Local server start {} processes. First process distributed "
+                "environment info (Only For Debug): {}".format(
+                    len(pod.servers),
+                    pretty_print_envs(proc_env, ("Distributed Envs", "Value"))))
+
         if args.log_dir is not None:
             os.system("mkdir -p {}".format(args.log_dir))
             fn = open("%s/serverlog.%d" % (args.log_dir, idx), "w")
@@ -338,21 +347,32 @@ def launch_ps(args):
         tp.rank = cur_server.rank
         tp.local_rank = idx
         tp.log_fn = fn
-        tp.log_offset = 0 if fn else None
+        tp.log_offset = fn.tell() if fn else None
         tp.cmd = cmd

         procs.append(tp)

     for idx, cur_worker in enumerate(pod.workers):
-        current_env.update({
+        proc_env = {
             "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints,
             "PADDLE_TRAINER_ENDPOINTS": worker_endpoints,
             "PADDLE_TRAINERS_NUM": str(worker_num),
             "TRAINING_ROLE": "TRAINER",
             "PADDLE_TRAINER_ID": str(cur_worker.rank)
-        })
+        }
+        current_env.update(proc_env)
         cmd = [sys.executable, "-u", args.training_script] + args.training_script_args
         cmds.append(cmd)

+        if idx == 0:
+            logger.info(
+                "Local worker start {} processes. First process distributed "
+                "environment info (Only For Debug): {}".format(
+                    len(pod.workers),
+                    pretty_print_envs(proc_env, ("Distributed Envs", "Value"))))
+
         if args.log_dir is not None:
             os.system("mkdir -p {}".format(args.log_dir))
             fn = open("%s/workerlog.%d" % (args.log_dir, idx), "w")
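The log_offset change above (and the identical one in the next hunk) initializes the offset from fn.tell() rather than a hard-coded 0; that offset is what a watcher can later seek to when echoing newly appended log output. A generic sketch of offset-based incremental log reading, with a hypothetical pull_new_output helper (this is not the launcher's actual watcher code):

import os
import tempfile

# Hypothetical illustration of offset-based incremental log reading.
log_path = os.path.join(tempfile.mkdtemp(), "workerlog.0")
fn = open(log_path, "w")    # write handle handed to the child process
log_offset = fn.tell()      # position from which fresh output will be appended

def pull_new_output(path, offset):
    """Return text appended since `offset`, plus the updated offset."""
    with open(path, "r") as f:
        f.seek(offset)
        text = f.read()
        return text, f.tell()

fn.write("step 1 done\n")
fn.flush()
new_text, log_offset = pull_new_output(log_path, log_offset)  # "step 1 done\n"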
@@ -366,11 +386,14 @@ def launch_ps(args):
         tp.rank = cur_worker.rank
         tp.local_rank = idx
         tp.log_fn = fn
-        tp.log_offset = 0 if fn else None
+        tp.log_offset = fn.tell() if fn else None
         tp.cmd = cmd

         procs.append(tp)

+    logger.info(
+        "Please check servers and workers logs in {}/workerlog.* and {}/serverlog.*".
+        format(args.log_dir, args.log_dir))
     # only wait worker to finish here
     for i, proc in enumerate(procs):
         if i < len(pod.servers):
@@ -403,16 +426,16 @@ def launch():
     cuda_device_num = fluid.core.get_cuda_device_count()
     if len(has_ps_args) > 0 or cuda_device_num == 0:
         logger.info(
-            "Run parameter-sever cpu mode. pserver args:{}, cuda count:{}".
+            "Run parameter-sever cpu mode. pserver arguments:{}, cuda count:{}".
             format(has_ps_args, cuda_device_num))
         launch_ps(args)
     elif len(has_collective_args) > 0:
-        logger.info("Run collective gpu mode. gpu args:{}, cuda count:{}".
+        logger.info("Run collective gpu mode. gpu arguments:{}, cuda count:{}".
                     format(has_collective_args, cuda_device_num))
         launch_collective(args)
     else:
         logger.warning(
-            "Not found distinct args. Default use gpu collective mode")
+            "Not found distinct arguments. Default use gpu collective mode")
         launch_collective(args)
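For orientation, the PADDLE_* entries assembled into proc_env above end up in each child process's environment, so a training script can read them with plain os.environ. A hedged sketch (the variable names come from the diff; the reading code itself is illustrative and not a Paddle API):

import os

# Illustrative only: inspect the environment that launch_ps exports.
role = os.environ.get("TRAINING_ROLE")                      # "PSERVER" or "TRAINER"
trainers_num = int(os.environ.get("PADDLE_TRAINERS_NUM", "0"))

if role == "PSERVER":
    pod_ip = os.environ.get("POD_IP")
    port = os.environ.get("PADDLE_PORT")
    print("server listening at {}:{}, expecting {} trainers".format(
        pod_ip, port, trainers_num))
elif role == "TRAINER":
    trainer_id = os.environ.get("PADDLE_TRAINER_ID")
    endpoints = os.environ.get("PADDLE_TRAINER_ENDPOINTS", "").split(",")
    print("trainer {} of {} endpoints".format(trainer_id, len(endpoints)))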
python/paddle/distributed/fleet/launch_utils.py
@@ -253,7 +253,8 @@ def terminate_local_procs(procs):
     for p in procs:
         if p.proc.poll() is None:
             p.proc.terminate()
-            p.log_fn.close()
+            if p.log_fn:
+                p.log_fn.close()
             logger.debug("terminate process id:{}".format(p.proc.pid))

     #wait all process terminiated
@@ -338,6 +339,45 @@ def get_ports(num, offset):
     return ports


+def pretty_print_envs(envs, header=None):
+    spacing = 2
+    max_k = 40
+    max_v = 45
+
+    for k, v in envs.items():
+        max_k = max(max_k, len(k))
+
+    h_format = "{{:^{}s}}{}{{:<{}s}}\n".format(max_k, " " * spacing, max_v)
+    l_format = "{{:<{}s}}{{}}{{:<{}s}}\n".format(max_k, max_v)
+    length = max_k + max_v + spacing
+
+    border = "".join(["="] * length)
+    line = "".join(["-"] * length)
+
+    draws = ""
+    draws += border + "\n"
+    if header:
+        draws += h_format.format(header[0], header[1])
+    else:
+        draws += h_format.format("fleetrun Distributed Envs", "Value")
+
+    draws += line + "\n"
+
+    for k, v in envs.items():
+        if isinstance(v, str) and len(v) >= max_v:
+            str_v = "... " + v[-41:]
+        else:
+            str_v = v
+
+        draws += l_format.format(k, " " * spacing, str(str_v))
+
+    draws += border
+
+    _str = "\n{}\n".format(draws)
+    return _str
+
+
 class TrainerProc(object):
     def __init__(self):
         self.proc = None
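A small usage sketch of the new helper (assuming a Paddle build that contains this commit; the module path is inferred from the file location above):

from paddle.distributed.fleet.launch_utils import pretty_print_envs

proc_env = {
    "PADDLE_PSERVERS_IP_PORT_LIST": "127.0.0.1:6170,127.0.0.1:6171",
    "PADDLE_PORT": "6170",
    "TRAINING_ROLE": "PSERVER",
    "PADDLE_TRAINERS_NUM": "2",
    "POD_IP": "127.0.0.1",
}

# Renders the dict as a bordered two-column table; string values longer than
# 45 characters are shortened to "... " plus their last 41 characters.
print(pretty_print_envs(proc_env, ("Distributed Envs", "Value")))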
@@ -373,11 +413,19 @@ def start_local_trainers(cluster,
         current_env.update(proc_env)

-        logger.debug("trainer proc env:{}".format(current_env))
-
         cmd = [sys.executable, "-u", training_script] + training_script_args

-        logger.info("start trainer proc:{} env:{}".format(cmd, proc_env))
+        logger.debug("start trainer proc{} env:{}".format(cmd, current_env))
+
+        if idx == 0:
+            logger.info("Local start {} processes. First process distributed "
+                        "environment info (Only For Debug): {}".format(
+                            len(pod.trainers),
+                            pretty_print_envs(proc_env, ("Distributed Envs",
+                                                         "Value"))))
+            logger.info(
+                "More details for debug about commands and environments are written in {}/run.sh".
+                format(log_dir))

         fn = None
         if log_dir is not None: