Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
8562668e
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
8562668e
编写于
3月 24, 2022
作者:
K
kuizhiqing
提交者:
GitHub
3月 24, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix device id env (#40844)
上级
1d60e819
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
28 addition
and
21 deletion
+28
-21
python/paddle/distributed/fleet/launch.py
python/paddle/distributed/fleet/launch.py
+2
-1
python/paddle/distributed/launch/context/__init__.py
python/paddle/distributed/launch/context/__init__.py
+3
-2
python/paddle/distributed/launch/context/device.py
python/paddle/distributed/launch/context/device.py
+9
-13
python/paddle/distributed/launch/controllers/collective.py
python/paddle/distributed/launch/controllers/collective.py
+5
-2
python/paddle/distributed/launch/controllers/controller.py
python/paddle/distributed/launch/controllers/controller.py
+2
-0
python/paddle/distributed/launch/plugins/__init__.py
python/paddle/distributed/launch/plugins/__init__.py
+3
-2
python/paddle/fluid/tests/unittests/test_run.py
python/paddle/fluid/tests/unittests/test_run.py
+4
-1
未找到文件。
python/paddle/distributed/fleet/launch.py
浏览文件 @
8562668e
...
...
@@ -242,7 +242,8 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
elastic_group
.
add_argument
(
"--force"
,
type
=
bool
,
default
=
False
,
help
=
"update np force"
)
return
parser
.
parse_args
()
known_args
,
_
=
parser
.
parse_known_args
()
return
known_args
def
get_cluster_from_args
(
args
,
device_mode
,
devices_per_proc
):
...
...
python/paddle/distributed/launch/context/__init__.py
浏览文件 @
8562668e
...
...
@@ -25,12 +25,13 @@ class Context(object):
def
__init__
(
self
,
enable_plugin
=
True
):
self
.
args
,
self
.
unknown_args
=
parse_args
()
self
.
envs
=
fetch_envs
()
self
.
logger
=
self
.
get_logger
()
self
.
set_env_in_args
()
self
.
node
=
Node
()
self
.
status
=
Status
()
self
.
set_env_in_args
()
self
.
logger
=
self
.
get_logger
()
# design for event queue, later
self
.
events
=
[]
...
...
python/paddle/distributed/launch/context/device.py
浏览文件 @
8562668e
...
...
@@ -57,7 +57,7 @@ class Device(object):
else
:
self
.
_labels
=
[]
def
get_selected_
flag
_key
(
self
):
def
get_selected_
device
_key
(
self
):
if
self
.
_dtype
==
DeviceType
.
CPU
:
return
'FLAGS_selected_cpus'
if
self
.
_dtype
==
DeviceType
.
GPU
:
...
...
@@ -70,19 +70,15 @@ class Device(object):
return
'FLAGS_selected_mlus'
return
'FLAGS_selected_devices'
def
get_selected_flag_label
(
self
,
idx
):
if
idx
<
len
(
self
.
_labels
):
return
self
.
_labels
[
idx
]
def
get_selected_devices
(
self
,
devices
=
''
):
'''
return the device label/id relative to the visible devices
'''
if
not
devices
:
return
[
str
(
x
)
for
x
in
range
(
0
,
len
(
self
.
_labels
))]
else
:
return
'0'
def
selected_flags
(
self
,
idx
=
None
):
if
idx
is
None
:
return
{
self
.
get_selected_flag_key
():
','
.
join
(
self
.
_labels
)}
else
:
return
{
self
.
get_selected_flag_key
():
self
.
get_selected_flag_label
(
idx
)
}
devs
=
[
x
.
strip
()
for
x
in
devices
.
split
(
','
)]
return
[
str
(
self
.
_labels
.
index
(
d
))
for
d
in
devs
]
@
classmethod
def
parse_device
(
self
):
...
...
python/paddle/distributed/launch/controllers/collective.py
浏览文件 @
8562668e
...
...
@@ -75,6 +75,9 @@ class CollectiveController(Controller):
job_endpoints
=
[
i
[
'endpoints'
]
for
i
in
peer_list
]
self
.
pod
.
reset
()
selected_dev_key
=
self
.
ctx
.
node
.
device
.
get_selected_device_key
()
selected_dev_list
=
self
.
ctx
.
node
.
device
.
get_selected_devices
(
self
.
ctx
.
args
.
devices
)
for
i
in
range
(
self
.
pod
.
replicas
):
e
=
{
"PADDLE_MASTER"
:
collective_master
,
...
...
@@ -90,9 +93,9 @@ class CollectiveController(Controller):
"PADDLE_RANK_IN_NODE"
:
str
(
i
),
}
if
self
.
pod
.
replicas
==
1
:
e
.
update
(
self
.
ctx
.
node
.
device
.
selected_flags
()
)
e
.
update
(
{
selected_dev_key
:
selected_dev_list
}
)
else
:
e
.
update
(
self
.
ctx
.
node
.
device
.
selected_flags
(
i
)
)
e
.
update
(
{
selected_dev_key
:
selected_dev_list
[
i
]}
)
self
.
add_container
(
envs
=
e
,
log_tag
=
i
)
return
True
...
...
python/paddle/distributed/launch/controllers/controller.py
浏览文件 @
8562668e
...
...
@@ -210,6 +210,8 @@ class Controller(ControllerBase):
if
self
.
ctx
.
args
.
nproc_per_node
:
return
int
(
self
.
ctx
.
args
.
nproc_per_node
)
elif
self
.
ctx
.
args
.
devices
:
return
len
(
self
.
ctx
.
args
.
devices
.
split
(
','
))
else
:
return
self
.
ctx
.
node
.
device
.
count
...
...
python/paddle/distributed/launch/plugins/__init__.py
浏览文件 @
8562668e
...
...
@@ -29,8 +29,9 @@ def process_args(ctx):
#argdev = ctx.args.gpus or ctx.args.xpus or ctx.args.npus
argdev
=
ctx
.
args
.
devices
if
argdev
:
ctx
.
node
.
device
.
labels
=
argdev
.
split
(
','
)
ctx
.
logger
.
debug
(
'Device reset by args {}'
.
format
(
argdev
))
for
d
in
argdev
.
split
(
','
):
assert
d
in
ctx
.
node
.
device
.
labels
,
'Device not found {}'
.
format
(
argdev
)
def
collective_compatible
(
ctx
):
...
...
python/paddle/fluid/tests/unittests/test_run.py
浏览文件 @
8562668e
...
...
@@ -64,7 +64,10 @@ class Collective_Test(unittest.TestCase):
if
args
:
cmd
.
extend
(
args
.
split
(
" "
))
cmd
.
extend
([
pyname
])
proc
=
subprocess
.
Popen
(
cmd
,
env
)
env
=
os
.
environ
.
copy
()
# virtual devies for testing
env
.
update
({
'CUDA_VISIBLE_DEVICES'
:
'0,1,2,3,4,5,6,7'
})
proc
=
subprocess
.
Popen
(
cmd
,
env
=
env
)
return
proc
def
test_collective_1
(
self
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录