Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
c91aaced
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
c91aaced
编写于
8月 08, 2022
作者:
K
kuizhiqing
提交者:
GitHub
8月 08, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[LAUNCH] make launch Compatible (#44881)
* make launch compatible * fix ut * fix log offset
上级
85df6d73
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
58 addition
and
37 deletion
+58
-37
python/paddle/distributed/launch/context/__init__.py
python/paddle/distributed/launch/context/__init__.py
+3
-0
python/paddle/distributed/launch/context/args_envs.py
python/paddle/distributed/launch/context/args_envs.py
+4
-1
python/paddle/distributed/launch/context/device.py
python/paddle/distributed/launch/context/device.py
+3
-5
python/paddle/distributed/launch/controllers/collective.py
python/paddle/distributed/launch/controllers/collective.py
+3
-1
python/paddle/distributed/launch/controllers/controller.py
python/paddle/distributed/launch/controllers/controller.py
+7
-6
python/paddle/distributed/launch/controllers/ps.py
python/paddle/distributed/launch/controllers/ps.py
+8
-8
python/paddle/distributed/launch/job/container.py
python/paddle/distributed/launch/job/container.py
+11
-2
python/paddle/fluid/tests/unittests/test_run.py
python/paddle/fluid/tests/unittests/test_run.py
+19
-14
未找到文件。
python/paddle/distributed/launch/context/__init__.py
浏览文件 @
c91aaced
...
...
@@ -101,4 +101,7 @@ class Context(object):
def
set_env_in_args
(
self
):
for
k
,
v
in
env_args_mapping
.
items
():
if
k
in
self
.
envs
:
print
(
f
"LAUNCH WARNNING args
{
v
}
is override by env
{
self
.
envs
[
k
]
}
"
)
setattr
(
self
.
args
,
v
,
self
.
envs
[
k
])
python/paddle/distributed/launch/context/args_envs.py
浏览文件 @
c91aaced
...
...
@@ -96,9 +96,12 @@ def parse_args():
help
=
"unique id of the job. Default default"
)
base_group
.
add_argument
(
"--devices"
,
"--gpus"
,
"--npus"
,
"--xpus"
,
type
=
str
,
default
=
None
,
help
=
"accelerate devices. as --gpus,npus,xps"
)
help
=
"accelerate devices. as --gpus,npus,xp
u
s"
)
base_group
.
add_argument
(
"--host"
,
type
=
str
,
default
=
None
,
help
=
"host ip"
)
...
...
python/paddle/distributed/launch/context/device.py
浏览文件 @
c91aaced
...
...
@@ -105,10 +105,9 @@ class Device(object):
os
.
getenv
(
'PADDLE_XCCL_BACKEND'
).
upper
())
if
visible_devices_str
in
os
.
environ
:
visible_devices
=
os
.
getenv
(
visible_devices_str
)
elif
'CUDA_VISIBLE_DEVICES'
in
os
.
environ
or
'NVIDIA_VISIBLE_DEVICES'
in
os
.
environ
:
elif
'CUDA_VISIBLE_DEVICES'
in
os
.
environ
:
dev
.
_dtype
=
DeviceType
.
GPU
visible_devices
=
os
.
getenv
(
"CUDA_VISIBLE_DEVICES"
)
or
os
.
getenv
(
"NVIDIA_VISIBLE_DEVICES"
)
visible_devices
=
os
.
getenv
(
"CUDA_VISIBLE_DEVICES"
)
elif
'XPU_VISIBLE_DEVICES'
in
os
.
environ
:
dev
.
_dtype
=
DeviceType
.
XPU
visible_devices
=
os
.
getenv
(
"XPU_VISIBLE_DEVICES"
)
...
...
@@ -151,8 +150,7 @@ class Device(object):
elif
fluid
.
core
.
is_compiled_with_cuda
():
dev
.
_dtype
=
DeviceType
.
GPU
num
=
fluid
.
core
.
get_cuda_device_count
()
visible_devices
=
os
.
getenv
(
"CUDA_VISIBLE_DEVICES"
)
or
os
.
getenv
(
"NVIDIA_VISIBLE_DEVICES"
)
visible_devices
=
os
.
getenv
(
"CUDA_VISIBLE_DEVICES"
)
elif
fluid
.
core
.
is_compiled_with_xpu
():
dev
.
_dtype
=
DeviceType
.
XPU
num
=
fluid
.
core
.
get_xpu_device_count
()
...
...
python/paddle/distributed/launch/controllers/collective.py
浏览文件 @
c91aaced
...
...
@@ -108,7 +108,9 @@ class CollectiveController(Controller):
else
:
e
.
update
({
'PADDLE_DISTRI_BACKEND'
:
'gloo'
})
self
.
add_container
(
envs
=
e
,
log_tag
=
i
)
# log_file = "{}.{}.{}.log".format(self.job.id, self.pod.name, i)
log_file
=
f
"workerlog.
{
i
}
"
self
.
add_container
(
envs
=
e
,
log_file
=
log_file
)
return
True
...
...
python/paddle/distributed/launch/controllers/controller.py
浏览文件 @
c91aaced
...
...
@@ -92,6 +92,9 @@ class ControllerBase(object):
self
.
master
.
set_status
(
status
)
while
self
.
pod
.
logs
():
pass
self
.
ctx
.
logger
.
info
(
"Pod {}"
.
format
(
status
))
return
True
...
...
@@ -105,6 +108,9 @@ class ControllerBase(object):
fc
=
self
.
pod
.
failed_container
()
self
.
ctx
.
logger
.
info
(
"Pod {}"
.
format
(
status
))
self
.
ctx
.
logger
.
error
(
"Container failed !!!
\n
{}"
.
format
(
fc
[
0
]))
self
.
ctx
.
logger
.
info
(
"------------------------- ERROR LOG DETAIL -------------------------"
)
fc
[
0
].
tail
()
if
self
.
ctx
.
args
.
elastic_level
<=
0
:
...
...
@@ -203,13 +209,8 @@ class Controller(ControllerBase):
container
=
None
,
entrypoint
=
None
,
envs
=
{},
log_
tag
=
None
,
log_
file
=
None
,
is_init
=
False
):
if
not
is_init
and
log_tag
is
not
None
:
log_file
=
"{}.{}.{}.log"
.
format
(
self
.
job
.
id
,
self
.
pod
.
name
,
log_tag
)
else
:
log_file
=
None
if
not
container
:
container
=
self
.
new_container
(
entrypoint
=
entrypoint
,
...
...
python/paddle/distributed/launch/controllers/ps.py
浏览文件 @
c91aaced
...
...
@@ -84,8 +84,8 @@ class PSController(Controller):
"POD_IP"
:
self
.
ctx
.
node
.
ip
,
}
e
.
update
(
_gloo_envs
)
log_
tag
=
"ps
.{}"
.
format
(
i
)
self
.
add_container
(
envs
=
e
,
log_
tag
=
log_tag
)
log_
file
=
"serverlog
.{}"
.
format
(
i
)
self
.
add_container
(
envs
=
e
,
log_
file
=
log_file
)
trainer_rank_offset
=
0
for
s
in
trainer_endpoints
:
...
...
@@ -106,8 +106,8 @@ class PSController(Controller):
"POD_IP"
:
self
.
ctx
.
node
.
ip
,
}
e
.
update
(
_gloo_envs
)
log_
tag
=
"trainer
.{}"
.
format
(
i
)
self
.
add_container
(
envs
=
e
,
log_
tag
=
log_tag
)
log_
file
=
"workerlog
.{}"
.
format
(
i
)
self
.
add_container
(
envs
=
e
,
log_
file
=
log_file
)
def
_build_pod_with_master
(
self
):
...
...
@@ -191,8 +191,8 @@ class PSController(Controller):
self
.
ctx
.
node
.
ip
,
}
e
.
update
(
_gloo_envs
)
log_
tag
=
"ps
.{}"
.
format
(
i
)
self
.
add_container
(
envs
=
e
,
log_
tag
=
log_tag
)
log_
file
=
"serverlog
.{}"
.
format
(
i
)
self
.
add_container
(
envs
=
e
,
log_
file
=
log_file
)
for
i
in
range
(
trainer_num
):
e
=
{
...
...
@@ -216,8 +216,8 @@ class PSController(Controller):
self
.
ctx
.
node
.
ip
,
}
e
.
update
(
_gloo_envs
)
log_
tag
=
"trainer
.{}"
.
format
(
i
)
self
.
add_container
(
envs
=
e
,
log_
tag
=
log_tag
)
log_
file
=
"workerlog
.{}"
.
format
(
i
)
self
.
add_container
(
envs
=
e
,
log_
file
=
log_file
)
''' NEW VERSION
for i in range(server_num):
e = {
...
...
python/paddle/distributed/launch/job/container.py
浏览文件 @
c91aaced
...
...
@@ -99,7 +99,7 @@ class Container(object):
d
=
os
.
path
.
dirname
(
pth
)
if
not
os
.
path
.
isdir
(
d
):
os
.
makedirs
(
d
,
exist_ok
=
True
)
return
open
(
pth
,
'
w
'
)
return
open
(
pth
,
'
a
'
)
except
:
return
None
...
...
@@ -115,11 +115,17 @@ class Container(object):
elif
self
.
_err
:
self
.
_stderr
=
self
.
_get_fd
(
self
.
_err
)
or
sys
.
stderr
if
not
self
.
_log_handler
:
self
.
_log_handler
=
open
(
self
.
_out
)
self
.
_log_handler
.
seek
(
0
,
2
)
self
.
_log_start_offset
=
self
.
_log_handler
.
tell
()
self
.
_proc
=
ProcessContext
(
self
.
_entrypoint
,
env
=
self
.
_env
,
out
=
self
.
_stdout
,
err
=
self
.
_stderr
,
shell
=
self
.
_shell
)
self
.
_proc
.
start
()
def
terminate
(
self
,
force
=
False
):
...
...
@@ -171,13 +177,16 @@ class Container(object):
try
:
if
offset
!=
0
or
whence
!=
1
:
if
whence
==
0
and
offset
<
self
.
_log_start_offset
:
offset
=
self
.
_log_start_offset
self
.
_log_handler
.
seek
(
offset
,
whence
)
for
_
in
range
(
limit
):
line
=
self
.
_log_handler
.
readline
()
if
not
line
:
break
return
False
fn
.
write
(
line
)
return
True
except
:
return
...
...
python/paddle/fluid/tests/unittests/test_run.py
浏览文件 @
c91aaced
...
...
@@ -52,8 +52,8 @@ def write_file(name, ct):
def
get_files
(
pth
,
prefix
):
return
[
f
for
f
in
listdir
(
pth
)
if
isfile
(
join
(
pth
,
f
))
and
f
.
startswith
(
prefix
)
and
f
!=
f
"
{
prefix
}
.gpu.log"
f
for
f
in
listdir
(
pth
)
if
isfile
(
join
(
pth
,
f
))
and
not
f
.
endswith
(
'gpu.log'
)
]
...
...
@@ -101,17 +101,19 @@ class Collective_Test(unittest.TestCase):
def
test_collective_3
(
self
):
log_dir
=
tempfile
.
TemporaryDirectory
()
port
=
random
.
randrange
(
6000
,
8000
)
args
=
"--job_id test3 --devices 0,1 --log_dir {} --master 127.0.0.1:{} --nnodes 2"
.
format
(
log_dir
.
name
,
port
)
p1
=
self
.
pdrun
(
args
)
p2
=
self
.
pdrun
(
args
)
args
=
"--job_id test3 --devices 0,1 --log_dir {} --master 127.0.0.1:{} --nnodes 2"
p1
=
self
.
pdrun
(
args
.
format
(
log_dir
.
name
+
"/1"
,
port
))
p2
=
self
.
pdrun
(
args
.
format
(
log_dir
.
name
+
"/2"
,
port
))
p1
.
wait
()
p2
.
wait
()
self
.
assertTrue
(
p1
.
poll
()
==
0
)
self
.
assertTrue
(
p2
.
poll
()
==
0
)
c
=
get_files
(
log_dir
.
name
,
'test3'
)
self
.
assertTrue
(
len
(
c
)
==
6
)
c1
=
get_files
(
log_dir
.
name
+
"/1"
,
'test3'
)
c2
=
get_files
(
log_dir
.
name
+
"/2"
,
'test3'
)
print
(
c1
)
self
.
assertTrue
(
len
(
c1
)
==
3
)
self
.
assertTrue
(
len
(
c2
)
==
3
)
log_dir
.
cleanup
()
...
...
@@ -156,17 +158,19 @@ class PS_Test(unittest.TestCase):
def
test_ps_3
(
self
):
log_dir
=
tempfile
.
TemporaryDirectory
()
port
=
random
.
randrange
(
6000
,
8000
)
args
=
"--job_id ps3 --log_dir {} --master 127.0.0.1:{} --nnodes 2 --server_num=1 --trainer_num=1"
.
format
(
log_dir
.
name
,
port
)
p1
=
self
.
pdrun
(
args
)
p2
=
self
.
pdrun
(
args
)
args
=
"--job_id ps3 --log_dir {} --master 127.0.0.1:{} --nnodes 2 --server_num=1 --trainer_num=1"
p1
=
self
.
pdrun
(
args
.
format
(
log_dir
.
name
+
"/1"
,
port
))
p2
=
self
.
pdrun
(
args
.
format
(
log_dir
.
name
+
"/2"
,
port
))
p1
.
wait
()
p2
.
wait
()
self
.
assertTrue
(
p1
.
poll
()
==
0
)
self
.
assertTrue
(
p2
.
poll
()
==
0
)
c
=
get_files
(
log_dir
.
name
,
'ps3'
)
self
.
assertTrue
(
len
(
c
)
==
6
)
c1
=
get_files
(
log_dir
.
name
+
"/1"
,
'ps3'
)
c2
=
get_files
(
log_dir
.
name
+
"/2"
,
'ps3'
)
print
(
c1
)
self
.
assertTrue
(
len
(
c1
)
==
3
)
self
.
assertTrue
(
len
(
c2
)
==
3
)
log_dir
.
cleanup
()
def
test_ps_4
(
self
):
...
...
@@ -178,6 +182,7 @@ class PS_Test(unittest.TestCase):
self
.
assertTrue
(
p1
.
poll
()
==
0
)
c
=
get_files
(
log_dir
.
name
,
'ps4'
)
print
(
c
)
self
.
assertTrue
(
len
(
c
)
==
5
)
log_dir
.
cleanup
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录