Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
f5aca8fb
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
f5aca8fb
编写于
1月 21, 2021
作者:
G
gongweibao
提交者:
GitHub
1月 21, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Pass device_ids info from launch to trainer. (#30632)
Pass device_ids info from launch to trainer
上级
d2404da7
变更
7
显示空白变更内容
内联
并排
Showing
7 changed file
with
49 addition
and
12 deletion
+49
-12
python/paddle/distributed/fleet/__init__.py
python/paddle/distributed/fleet/__init__.py
+7
-1
python/paddle/distributed/fleet/base/fleet_base.py
python/paddle/distributed/fleet/base/fleet_base.py
+8
-2
python/paddle/distributed/fleet/base/role_maker.py
python/paddle/distributed/fleet/base/role_maker.py
+15
-3
python/paddle/distributed/fleet/launch_utils.py
python/paddle/distributed/fleet/launch_utils.py
+12
-1
python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py
...tributed/fleet/meta_optimizers/ascend/ascend_optimizer.py
+1
-1
python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py
.../fluid/tests/unittests/ascend_multi_process_collective.py
+4
-2
python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh
.../paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh
+2
-2
未找到文件。
python/paddle/distributed/fleet/__init__.py
浏览文件 @
f5aca8fb
...
...
@@ -44,7 +44,13 @@ node_num=fleet.node_num
rank
=
fleet
.
worker_index
nranks
=
fleet
.
worker_num
world_size
=
fleet
.
worker_num
rank_in_node
=
fleet
.
rank_in_node
# device id in current trainer
local_device_ids
=
fleet
.
local_device_ids
# device ids in world
world_device_ids
=
fleet
.
world_device_ids
# rank in node
local_rank
=
fleet
.
local_rank
rank_in_node
=
local_rank
is_worker
=
fleet
.
is_worker
worker_endpoints
=
fleet
.
worker_endpoints
server_num
=
fleet
.
server_num
...
...
python/paddle/distributed/fleet/base/fleet_base.py
浏览文件 @
f5aca8fb
...
...
@@ -291,8 +291,14 @@ class Fleet(object):
def
node_num
(
self
):
return
self
.
_role_maker
.
_get_node_num
()
def
rank_in_node
(
self
):
return
self
.
_role_maker
.
_get_rank_in_node
()
def
local_rank
(
self
):
return
self
.
_role_maker
.
_get_local_rank
()
def
local_device_ids
(
self
):
return
self
.
_role_maker
.
_get_local_device_ids
()
def
world_device_ids
(
self
):
return
self
.
_role_maker
.
_get_world_device_ids
()
def
is_worker
(
self
):
"""
...
...
python/paddle/distributed/fleet/base/role_maker.py
浏览文件 @
f5aca8fb
...
...
@@ -622,10 +622,20 @@ class PaddleCloudRoleMaker(RoleMakerBase):
self
.
_generate_role
()
return
self
.
_nodes_num
def
_get_
rank_in_node
(
self
):
def
_get_
local_rank
(
self
):
if
not
self
.
_role_is_generated
:
self
.
_generate_role
()
return
self
.
_rank_in_node
return
self
.
_local_rank
def
_get_local_device_ids
(
self
):
if
not
self
.
_role_is_generated
:
self
.
_generate_role
()
return
self
.
_local_device_ids
def
_get_world_device_ids
(
self
):
if
not
self
.
_role_is_generated
:
self
.
_generate_role
()
return
self
.
_world_device_ids
def
_get_trainer_endpoints
(
self
):
"""
...
...
@@ -787,7 +797,9 @@ class PaddleCloudRoleMaker(RoleMakerBase):
self
.
_trainers_num
=
len
(
self
.
_worker_endpoints
)
self
.
_nodes_num
=
len
(
set
([
x
.
split
(
':'
)[
0
]
for
x
in
self
.
_worker_endpoints
]))
self
.
_rank_in_node
=
os
.
getenv
(
"PADDLE_RANK_IN_NODE"
)
self
.
_local_rank
=
os
.
getenv
(
"PADDLE_RANK_IN_NODE"
)
self
.
_local_device_ids
=
os
.
getenv
(
"PADDLE_LOCAL_DEVICE_IDS"
)
self
.
_world_device_ids
=
os
.
getenv
(
"PADDLE_WORLD_DEVICE_IDS"
)
def
_gloo_init
(
self
):
# PADDLE_WITH_GLOO 1: trainer barrier, 2: all barrier
...
...
python/paddle/distributed/fleet/launch_utils.py
浏览文件 @
f5aca8fb
...
...
@@ -98,6 +98,13 @@ class Cluster(object):
r
.
append
(
t
.
endpoint
)
return
r
def
world_device_ids
(
self
):
r
=
[]
for
pod
in
self
.
pods
:
for
t
in
pod
.
trainers
:
r
.
append
(
t
.
accelerators
)
return
r
def
pods_endpoints
(
self
):
r
=
[]
for
pod
in
self
.
pods
:
...
...
@@ -452,6 +459,8 @@ def start_local_trainers(cluster,
current_env
.
pop
(
"http_proxy"
,
None
)
current_env
.
pop
(
"https_proxy"
,
None
)
ids
=
cluster
.
world_device_ids
()
res
=
[
':'
.
join
(
ele
)
for
ele
in
ids
]
procs
=
[]
for
idx
,
t
in
enumerate
(
pod
.
trainers
):
proc_env
=
{
...
...
@@ -459,7 +468,9 @@ def start_local_trainers(cluster,
"PADDLE_CURRENT_ENDPOINT"
:
"%s"
%
t
.
endpoint
,
"PADDLE_TRAINERS_NUM"
:
"%d"
%
cluster
.
trainers_nranks
(),
"PADDLE_TRAINER_ENDPOINTS"
:
","
.
join
(
cluster
.
trainers_endpoints
()),
"PADDLE_RANK_IN_NODE"
:
str
(
idx
)
"PADDLE_RANK_IN_NODE"
:
str
(
idx
),
"PADDLE_LOCAL_DEVICE_IDS"
:
","
.
join
(
t
.
accelerators
),
"PADDLE_WORLD_DEVICE_IDS"
:
","
.
join
(
res
),
}
if
len
(
t
.
accelerators
)
>
0
and
pod
.
device_mode
==
DeviceMode
.
GPU
:
...
...
python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py
浏览文件 @
f5aca8fb
...
...
@@ -150,7 +150,7 @@ class AscendOptimizer(Optimizer):
# Config about Graph Engine can be found in https://support.huaweicloud.com/
config
=
{
"ge.exec.deviceId"
:
str
(
fleet
.
rank_in_node
()),
"ge.exec.deviceId"
:
str
(
fleet
.
local_device_ids
()),
"ge.graphRunMode"
:
"1"
,
"ge.exec.precision_mode"
:
"must_keep_origin_dtype"
,
# if multi mode
...
...
python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py
浏览文件 @
f5aca8fb
...
...
@@ -23,9 +23,11 @@ def train(prefix):
current_endpoint
=
os
.
getenv
(
"PADDLE_CURRENT_ENDPOINT"
)
worker_endpoints
=
worker_endpoints_env
trainers_num
=
len
(
worker_endpoints
.
split
(
','
))
device_ids
=
os
.
getenv
(
"PADDLE_WORLD_DEVICE_IDS"
)
current_device_id
=
os
.
getenv
(
"PADDLE_LOCAL_DEVICE_IDS"
)
details
=
"selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"
\
.
format
(
selected_accelerators
,
worker_endpoints
,
trainers_num
,
current_endpoint
,
trainer_id
)
details
=
"selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}
device_ids:{} device_id:{}
"
\
.
format
(
selected_accelerators
,
worker_endpoints
,
trainers_num
,
current_endpoint
,
trainer_id
,
device_ids
,
current_device_id
)
print
(
details
)
with
open
(
"multi_process_{}.check_{}.log"
.
format
(
prefix
,
trainer_id
),
"w"
)
as
f
:
...
...
python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh
浏览文件 @
f5aca8fb
...
...
@@ -30,8 +30,8 @@ export TRAINER_PORTS_NUM=2
distributed_args
=
"--ips=
${
cluster_node_ips
}
--ascend_npus=0,1 --log_dir=testlog"
python
-m
paddle.distributed.fleet.launch
${
distributed_args
}
ascend_multi_process_collective.py fleetlaunchascend
str1
=
"selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0"
str2
=
"selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1"
str1
=
"selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0
device_ids:0,1,0,1 device_id:0
"
str2
=
"selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1
device_ids:0,1,0,1 device_id:1
"
file_0
=
"multi_process_fleetlaunchascend.check_0.log"
file_1
=
"multi_process_fleetlaunchascend.check_1.log"
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录