BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)

Unverified commit a6edbc47, authored Feb 18, 2021 by xiayanming, committed by GitHub on Feb 18, 2021

support parsing ascend rank table file (#31000)

Parent: 1201cd2e
Showing 4 changed files with 237 additions and 4 deletions (+237 −4):

python/paddle/distributed/fleet/ascend_utils.py                    +122 −0
python/paddle/distributed/fleet/launch.py                          +9   −1
python/paddle/distributed/fleet/launch_utils.py                    +3   −3
python/paddle/fluid/tests/unittests/test_fleet_launch_ascend2.sh   +103 −0
python/paddle/distributed/fleet/ascend_utils.py (new file, mode 100644)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import json
import paddle
from paddle.distributed.fleet.launch_utils import get_cluster, logger, get_host_name_ip, DeviceMode


def _get_ascend_rankfile(rank_table_file_path):
    """
    Args:
        rank_table_file_path: ascend npu rank file json
            {
                "status": "completed",
                "version": "1.0",
                "server_count": "2",
                "server_list": [
                    {
                        "server_id": "192.168.24.217",
                        "device": [
                            {
                                "device_id": "0",
                                "device_ip": "192.1.184.23",
                                "rank_id": "0"
                            },
                            {
                                "device_id": "1",
                                "device_ip": "192.2.21.93",
                                "rank_id": "1"
                            }
                        ]
                    },
                    {
                        "server_id": "192.168.26.177",
                        "device": [
                            {
                                "device_id": "0",
                                "device_ip": "192.1.94.132",
                                "rank_id": "2"
                            },
                            {
                                "device_id": "1",
                                "device_ip": "192.2.94.30",
                                "rank_id": "3"
                            }
                        ]
                    }
                ]
            }

    Returns:
        node_ips: node ip list
        device_count: number of npus per machine
    """
    json_data = None
    with open(rank_table_file_path) as json_file:
        json_data = json.load(json_file)

    node_ips = []
    device_count = 0
    server_list = json_data['server_list']
    for server in server_list:
        node_ips.append(server['server_id'])
        device_list = server['device']
        device_count = len(device_list)

    return node_ips, device_count


def get_cloud_cluster(rank_table_file=None,
                      device_mode=DeviceMode.ASCEND_NPU,
                      devices_per_proc=None,
                      start_port=6070):
    """
    Args:
        rank_table_file: string, ascend npu rank file path
        device_mode: DeviceMode(Int)
        devices_per_proc: list
        start_port: the start port of current runtime env
    """
    if rank_table_file:
        # multi trainers
        node_ips, device_count = _get_ascend_rankfile(rank_table_file)
        node_index = os.environ.get("PADDLE_TRAINER_ID")
        node_ip = None
        if node_index is None:
            _, node_ip = get_host_name_ip()
        else:
            node_ip = node_ips[int(node_index)]

        assert node_ip in node_ips, \
            "Can't find your local ip {%s} in node_ips: {%s}" % (node_ip, node_ips)
    else:
        # single trainer (single ascend card)
        node_ips = ["127.0.0.1"]
        node_ip = node_ips[0]
        device_count = 1
        devices_per_proc = None

    if devices_per_proc is None:
        devices_per_proc = [str(x) for x in range(device_count)]

    free_ports = [
        x for x in range(start_port, start_port + len(devices_per_proc))
    ]
    trainer_endpoints = []
    for ip in node_ips:
        trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])

    return get_cluster(node_ips, node_ip, trainer_endpoints, device_mode,
                       devices_per_proc)
\ No newline at end of file
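
As a quick illustration of the parsing above, the sketch below feeds the docstring's sample rank table through _get_ascend_rankfile. The temporary-file handling and the asserted values are assumptions for this sketch, not part of the commit:

import json
import tempfile

from paddle.distributed.fleet.ascend_utils import _get_ascend_rankfile

# The two-server, two-NPU rank table from the docstring above.
rank_table = {
    "status": "completed",
    "version": "1.0",
    "server_count": "2",
    "server_list": [
        {"server_id": "192.168.24.217",
         "device": [{"device_id": "0", "device_ip": "192.1.184.23", "rank_id": "0"},
                    {"device_id": "1", "device_ip": "192.2.21.93", "rank_id": "1"}]},
        {"server_id": "192.168.26.177",
         "device": [{"device_id": "0", "device_ip": "192.1.94.132", "rank_id": "2"},
                    {"device_id": "1", "device_ip": "192.2.94.30", "rank_id": "3"}]},
    ],
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(rank_table, f)

# node_ips collects every server_id; device_count is the length of the last
# server's device list, i.e. the NPUs per machine.
node_ips, device_count = _get_ascend_rankfile(f.name)
assert node_ips == ["192.168.24.217", "192.168.26.177"]
assert device_count == 2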
python/paddle/distributed/fleet/launch.py
@@ -73,6 +73,7 @@ from paddle.distributed.fleet import launch_utils
 # TODO(danleifeng): Don't import * from a module
 from paddle.distributed.fleet.launch_utils import *
 import paddle.distributed.fleet.cloud_utils as cloud_utils
+import paddle.distributed.fleet.ascend_utils as ascend_utils


 def _print_arguments(args):
@@ -129,7 +130,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
         default=None,
         help="It's for ascend npu training."
         "For example:"
-        "--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one gpu."
+        "--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one npu."
     )

     base_group.add_argument("--selected_gpus", dest="gpus")
@@ -227,6 +228,13 @@ def launch_collective(args):
         cluster, pod = cloud_utils.get_cloud_cluster(
             args.ips, device_mode, devices_per_proc, start_port)
         logger.debug("get cluster from cloud:{}".format(cluster))
+    elif device_mode == DeviceMode.ASCEND_NPU:
+        # for ascend
+        cluster, pod = ascend_utils.get_cloud_cluster(
+            rank_table_file=os.getenv("RANK_TABLE_FILE", None),
+            device_mode=device_mode,
+            devices_per_proc=devices_per_proc,
+            start_port=start_port)
     else:
         # trainers_num = 1 or not use paddlecloud ips="a,b"
         cluster, pod = get_cluster_from_args(args, device_mode,
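
The new branch only fires for DeviceMode.ASCEND_NPU and reads the rank table path from the RANK_TABLE_FILE environment variable. A minimal sketch of calling it directly, outside the launcher; the one-server rank table and the PADDLE_TRAINER_ID setting are assumptions so the sketch can run on its own:

import json
import os
import tempfile

from paddle.distributed.fleet import ascend_utils
from paddle.distributed.fleet.launch_utils import DeviceMode

# Write a minimal one-server rank table (placeholder contents for the sketch).
table = {"server_list": [{"server_id": "127.0.0.1",
                          "device": [{"device_id": "0", "rank_id": "0"}]}]}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(table, f)
os.environ["RANK_TABLE_FILE"] = f.name
# Pin this process to node 0 so the local-ip lookup matches the table.
os.environ["PADDLE_TRAINER_ID"] = "0"

# Mirrors the new elif branch in launch_collective().
cluster, pod = ascend_utils.get_cloud_cluster(
    rank_table_file=os.getenv("RANK_TABLE_FILE", None),
    device_mode=DeviceMode.ASCEND_NPU,
    devices_per_proc=None,  # None: one process per NPU found in the table
    start_port=6070)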
python/paddle/distributed/fleet/launch_utils.py
@@ -459,7 +459,7 @@ def start_local_trainers(cluster,
     current_env.pop("http_proxy", None)
     current_env.pop("https_proxy", None)

-    ids = cluster.world_device_ids()
+    ids = cluster.world_device_ids()
     res = [':'.join(ele) for ele in ids]
     procs = []
     for idx, t in enumerate(pod.trainers):
@@ -582,8 +582,8 @@ def get_ascend_npus(npus):
     if npus is None:
         count = fluid.core.NPUDevice.get_device_count()
         if count <= 0:
-            return ret
-        ret = [x for x in range(count)]
+            return None
+        ret = [str(x) for x in range(count)]
     else:
         ret = [x.strip() for x in npus.split(',')]

     return ret
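
The get_ascend_npus hunk fixes two things: with no NPUs present the old code returned ret before it was assigned in the shown context, and the auto-detected ids were ints while the --ascend_npus path yields strings. A standalone replica of the patched logic, with device_count standing in for fluid.core.NPUDevice.get_device_count() (the function name and parameter here are illustrative):

def get_ascend_npus_patched(npus, device_count):
    # Replica of get_ascend_npus after this commit; device_count stands in
    # for fluid.core.NPUDevice.get_device_count().
    if npus is None:
        if device_count <= 0:
            return None                               # was: `return ret`
        ret = [str(x) for x in range(device_count)]   # was ints, now strings
    else:
        # Explicit id list, e.g. "--ascend_npus=0,1,3".
        ret = [x.strip() for x in npus.split(',')]
    return ret

assert get_ascend_npus_patched(None, 0) is None
assert get_ascend_npus_patched(None, 2) == ['0', '1']
assert get_ascend_npus_patched("0, 1, 3", 8) == ['0', '1', '3']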
python/paddle/fluid/tests/unittests/test_fleet_launch_ascend2.sh (new file, mode 100644)
#!/bin/bash

# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -e

RANK_TABLE_FILE_NAME="rank_table_file.json"
cat > ${RANK_TABLE_FILE_NAME} <<EOF
{
    "status": "completed",
    "version": "1.0",
    "server_count": "2",
    "server_list": [
        {
            "server_id": "127.0.0.1",
            "device": [
                {
                    "device_id": "0",
                    "device_ip": "192.1.184.23",
                    "rank_id": "0"
                },
                {
                    "device_id": "1",
                    "device_ip": "192.2.21.93",
                    "rank_id": "1"
                }
            ]
        },
        {
            "server_id": "127.0.0.2",
            "device": [
                {
                    "device_id": "0",
                    "device_ip": "192.1.94.132",
                    "rank_id": "2"
                },
                {
                    "device_id": "1",
                    "device_ip": "192.2.94.30",
                    "rank_id": "3"
                }
            ]
        }
    ]
}
EOF

# set ascend rank table file env
export RANK_TABLE_FILE="${PWD}/${RANK_TABLE_FILE_NAME}"

# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0

export PADDLE_PORT=35789
export TRAINER_PORTS_NUM=2

distributed_args="--run_mode=collective --log_dir=testlog"
python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend

str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0"
str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:1"
file_0="multi_process_fleetlaunchascend.check_0.log"
file_1="multi_process_fleetlaunchascend.check_1.log"

echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
    echo "find trainer 0"
else
    echo "not find trainer 0"
    exit -1
fi

if grep -q "$str2" "$file_1"; then
    echo "find trainer 1"
else
    echo "not find trainer 1"
    exit -1
fi

# test async poll process
if [ -f $file_0 ]; then
    rm $file_0
fi
if [ -f $file_1 ]; then
    rm $file_1
fi
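
The endpoint lists asserted in str1/str2 follow from the port expansion in get_cloud_cluster: each node in the rank table gets TRAINER_PORTS_NUM consecutive ports starting at PADDLE_PORT. A quick check of that arithmetic (variable names here are illustrative):

# Two nodes from the rank table, two ports per node starting at PADDLE_PORT.
node_ips = ["127.0.0.1", "127.0.0.2"]
start_port, ports_per_node = 35789, 2

endpoints = []
for ip in node_ips:
    endpoints += ["%s:%d" % (ip, p)
                  for p in range(start_port, start_port + ports_per_node)]

# Matches the worker_endpoints substring asserted in str1 and str2 above.
assert ",".join(endpoints) == (
    "127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790")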