Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
8562668e
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2299
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
8562668e
编写于
3月 24, 2022
作者:
K
kuizhiqing
提交者:
GitHub
3月 24, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix device id env (#40844)
上级
1d60e819
变更
7
隐藏空白更改
内联
并排
Showing
7 changed files
with
28 additions
and
21 deletions
+28
-21
python/paddle/distributed/fleet/launch.py
python/paddle/distributed/fleet/launch.py
+2
-1
python/paddle/distributed/launch/context/__init__.py
python/paddle/distributed/launch/context/__init__.py
+3
-2
python/paddle/distributed/launch/context/device.py
python/paddle/distributed/launch/context/device.py
+9
-13
python/paddle/distributed/launch/controllers/collective.py
python/paddle/distributed/launch/controllers/collective.py
+5
-2
python/paddle/distributed/launch/controllers/controller.py
python/paddle/distributed/launch/controllers/controller.py
+2
-0
python/paddle/distributed/launch/plugins/__init__.py
python/paddle/distributed/launch/plugins/__init__.py
+3
-2
python/paddle/fluid/tests/unittests/test_run.py
python/paddle/fluid/tests/unittests/test_run.py
+4
-1
未找到文件。
python/paddle/distributed/fleet/launch.py
浏览文件 @
8562668e
...
@@ -242,7 +242,8 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
...
@@ -242,7 +242,8 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
elastic_group
.
add_argument
(
elastic_group
.
add_argument
(
"--force"
,
type
=
bool
,
default
=
False
,
help
=
"update np force"
)
"--force"
,
type
=
bool
,
default
=
False
,
help
=
"update np force"
)
return
parser
.
parse_args
()
known_args
,
_
=
parser
.
parse_known_args
()
return
known_args
def
get_cluster_from_args
(
args
,
device_mode
,
devices_per_proc
):
def
get_cluster_from_args
(
args
,
device_mode
,
devices_per_proc
):
...
...
python/paddle/distributed/launch/context/__init__.py
浏览文件 @
8562668e
...
@@ -25,12 +25,13 @@ class Context(object):
...
@@ -25,12 +25,13 @@ class Context(object):
def
__init__
(
self
,
enable_plugin
=
True
):
def
__init__
(
self
,
enable_plugin
=
True
):
self
.
args
,
self
.
unknown_args
=
parse_args
()
self
.
args
,
self
.
unknown_args
=
parse_args
()
self
.
envs
=
fetch_envs
()
self
.
envs
=
fetch_envs
()
self
.
logger
=
self
.
get_logger
()
self
.
set_env_in_args
()
self
.
node
=
Node
()
self
.
node
=
Node
()
self
.
status
=
Status
()
self
.
status
=
Status
()
self
.
set_env_in_args
()
self
.
logger
=
self
.
get_logger
()
# design for event queue, later
# design for event queue, later
self
.
events
=
[]
self
.
events
=
[]
...
...
python/paddle/distributed/launch/context/device.py
浏览文件 @
8562668e
...
@@ -57,7 +57,7 @@ class Device(object):
...
@@ -57,7 +57,7 @@ class Device(object):
else
:
else
:
self
.
_labels
=
[]
self
.
_labels
=
[]
def
get_selected_
flag
_key
(
self
):
def
get_selected_
device
_key
(
self
):
if
self
.
_dtype
==
DeviceType
.
CPU
:
if
self
.
_dtype
==
DeviceType
.
CPU
:
return
'FLAGS_selected_cpus'
return
'FLAGS_selected_cpus'
if
self
.
_dtype
==
DeviceType
.
GPU
:
if
self
.
_dtype
==
DeviceType
.
GPU
:
...
@@ -70,19 +70,15 @@ class Device(object):
...
@@ -70,19 +70,15 @@ class Device(object):
return
'FLAGS_selected_mlus'
return
'FLAGS_selected_mlus'
return
'FLAGS_selected_devices'
return
'FLAGS_selected_devices'
def
get_selected_flag_label
(
self
,
idx
):
def
get_selected_devices
(
self
,
devices
=
''
):
if
idx
<
len
(
self
.
_labels
):
'''
return
self
.
_labels
[
idx
]
return the device label/id relative to the visible devices
'''
if
not
devices
:
return
[
str
(
x
)
for
x
in
range
(
0
,
len
(
self
.
_labels
))]
else
:
else
:
return
'0'
devs
=
[
x
.
strip
()
for
x
in
devices
.
split
(
','
)]
return
[
str
(
self
.
_labels
.
index
(
d
))
for
d
in
devs
]
def
selected_flags
(
self
,
idx
=
None
):
if
idx
is
None
:
return
{
self
.
get_selected_flag_key
():
','
.
join
(
self
.
_labels
)}
else
:
return
{
self
.
get_selected_flag_key
():
self
.
get_selected_flag_label
(
idx
)
}
@
classmethod
@
classmethod
def
parse_device
(
self
):
def
parse_device
(
self
):
...
...
python/paddle/distributed/launch/controllers/collective.py
浏览文件 @
8562668e
...
@@ -75,6 +75,9 @@ class CollectiveController(Controller):
...
@@ -75,6 +75,9 @@ class CollectiveController(Controller):
job_endpoints
=
[
i
[
'endpoints'
]
for
i
in
peer_list
]
job_endpoints
=
[
i
[
'endpoints'
]
for
i
in
peer_list
]
self
.
pod
.
reset
()
self
.
pod
.
reset
()
selected_dev_key
=
self
.
ctx
.
node
.
device
.
get_selected_device_key
()
selected_dev_list
=
self
.
ctx
.
node
.
device
.
get_selected_devices
(
self
.
ctx
.
args
.
devices
)
for
i
in
range
(
self
.
pod
.
replicas
):
for
i
in
range
(
self
.
pod
.
replicas
):
e
=
{
e
=
{
"PADDLE_MASTER"
:
collective_master
,
"PADDLE_MASTER"
:
collective_master
,
...
@@ -90,9 +93,9 @@ class CollectiveController(Controller):
...
@@ -90,9 +93,9 @@ class CollectiveController(Controller):
"PADDLE_RANK_IN_NODE"
:
str
(
i
),
"PADDLE_RANK_IN_NODE"
:
str
(
i
),
}
}
if
self
.
pod
.
replicas
==
1
:
if
self
.
pod
.
replicas
==
1
:
e
.
update
(
self
.
ctx
.
node
.
device
.
selected_flags
()
)
e
.
update
(
{
selected_dev_key
:
selected_dev_list
}
)
else
:
else
:
e
.
update
(
self
.
ctx
.
node
.
device
.
selected_flags
(
i
)
)
e
.
update
(
{
selected_dev_key
:
selected_dev_list
[
i
]}
)
self
.
add_container
(
envs
=
e
,
log_tag
=
i
)
self
.
add_container
(
envs
=
e
,
log_tag
=
i
)
return
True
return
True
...
...
python/paddle/distributed/launch/controllers/controller.py
浏览文件 @
8562668e
...
@@ -210,6 +210,8 @@ class Controller(ControllerBase):
...
@@ -210,6 +210,8 @@ class Controller(ControllerBase):
if
self
.
ctx
.
args
.
nproc_per_node
:
if
self
.
ctx
.
args
.
nproc_per_node
:
return
int
(
self
.
ctx
.
args
.
nproc_per_node
)
return
int
(
self
.
ctx
.
args
.
nproc_per_node
)
elif
self
.
ctx
.
args
.
devices
:
return
len
(
self
.
ctx
.
args
.
devices
.
split
(
','
))
else
:
else
:
return
self
.
ctx
.
node
.
device
.
count
return
self
.
ctx
.
node
.
device
.
count
...
...
python/paddle/distributed/launch/plugins/__init__.py
浏览文件 @
8562668e
...
@@ -29,8 +29,9 @@ def process_args(ctx):
...
@@ -29,8 +29,9 @@ def process_args(ctx):
#argdev = ctx.args.gpus or ctx.args.xpus or ctx.args.npus
#argdev = ctx.args.gpus or ctx.args.xpus or ctx.args.npus
argdev
=
ctx
.
args
.
devices
argdev
=
ctx
.
args
.
devices
if
argdev
:
if
argdev
:
ctx
.
node
.
device
.
labels
=
argdev
.
split
(
','
)
for
d
in
argdev
.
split
(
','
):
ctx
.
logger
.
debug
(
'Device reset by args {}'
.
format
(
argdev
))
assert
d
in
ctx
.
node
.
device
.
labels
,
'Device not found {}'
.
format
(
argdev
)
def
collective_compatible
(
ctx
):
def
collective_compatible
(
ctx
):
...
...
python/paddle/fluid/tests/unittests/test_run.py
浏览文件 @
8562668e
...
@@ -64,7 +64,10 @@ class Collective_Test(unittest.TestCase):
...
@@ -64,7 +64,10 @@ class Collective_Test(unittest.TestCase):
if
args
:
if
args
:
cmd
.
extend
(
args
.
split
(
" "
))
cmd
.
extend
(
args
.
split
(
" "
))
cmd
.
extend
([
pyname
])
cmd
.
extend
([
pyname
])
proc
=
subprocess
.
Popen
(
cmd
,
env
)
env
=
os
.
environ
.
copy
()
# virtual devices for testing
env
.
update
({
'CUDA_VISIBLE_DEVICES'
:
'0,1,2,3,4,5,6,7'
})
proc
=
subprocess
.
Popen
(
cmd
,
env
=
env
)
return
proc
return
proc
def
test_collective_1
(
self
):
def
test_collective_1
(
self
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录