Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
387c1db4
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
387c1db4
编写于
2月 25, 2021
作者:
X
xiayanming
提交者:
GitHub
2月 25, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Ascendrc (#31065)
Ascendrc
上级
ff4654e2
变更
8
隐藏空白更改
内联
并排
Showing
8 changed files
with
228 additions
and
162 deletions
+228
-162
python/paddle/distributed/fleet/ascend_utils.py
python/paddle/distributed/fleet/ascend_utils.py
+10
-12
python/paddle/distributed/fleet/launch.py
python/paddle/distributed/fleet/launch.py
+0
-10
python/paddle/distributed/fleet/launch_utils.py
python/paddle/distributed/fleet/launch_utils.py
+5
-13
python/paddle/fluid/tests/unittests/ascend_group.py
python/paddle/fluid/tests/unittests/ascend_group.py
+21
-2
python/paddle/fluid/tests/unittests/hccl_tools.py
python/paddle/fluid/tests/unittests/hccl_tools.py
+150
-0
python/paddle/fluid/tests/unittests/test_ascend_group.sh
python/paddle/fluid/tests/unittests/test_ascend_group.sh
+7
-8
python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh
.../paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh
+35
-14
python/paddle/fluid/tests/unittests/test_fleet_launch_ascend2.sh
...paddle/fluid/tests/unittests/test_fleet_launch_ascend2.sh
+0
-103
未找到文件。
python/paddle/distributed/fleet/ascend_utils.py
浏览文件 @
387c1db4
...
...
@@ -79,24 +79,25 @@ def _get_ascend_rankfile(rank_table_file_path):
def
get_cloud_cluster
(
rank_table_file
=
None
,
device_mode
=
DeviceMode
.
ASCEND_NPU
,
devices_per_proc
=
None
,
start_port
=
6070
):
"""
Args:
rank_table_file: string, ascend npu rank file path
device_mode: DeviceMode(Int)
devices_per_proc:list
start_port: the start port of current runtime env
"""
if
rank_table_file
:
# multi trainers
node_ips
,
device_count
=
_get_ascend_rankfile
(
rank_table_file
)
node_index
=
os
.
environ
.
get
(
"PADDLE_TRAINER_ID"
)
node_ip
=
None
if
node_index
is
None
:
_
,
node_ip
=
get_host_name_ip
()
if
len
(
node_ips
)
==
1
:
node_ip
=
node_ips
[
0
]
else
:
node_ip
=
node_ips
[
int
(
node_index
)]
node_index
=
os
.
environ
.
get
(
"PADDLE_TRAINER_ID"
)
node_ip
=
None
if
node_index
:
node_ip
=
node_ips
[
int
(
node_index
)]
else
:
_
,
node_ip
=
get_host_name_ip
()
assert
node_ip
in
node_ips
,
"Can't find your local ip {%s} in node_ips: {%s}"
\
%
(
node_ip
,
node_ips
)
...
...
@@ -105,11 +106,8 @@ def get_cloud_cluster(rank_table_file=None,
node_ips
=
[
"127.0.0.1"
]
node_ip
=
node_ips
[
0
]
device_count
=
1
devices_per_proc
=
None
if
devices_per_proc
is
None
:
devices_per_proc
=
[
str
(
x
)
for
x
in
range
(
device_count
)]
devices_per_proc
=
[
str
(
x
)
for
x
in
range
(
device_count
)]
free_ports
=
[
x
for
x
in
range
(
start_port
,
start_port
+
len
(
devices_per_proc
))
]
...
...
python/paddle/distributed/fleet/launch.py
浏览文件 @
387c1db4
...
...
@@ -124,15 +124,6 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra
default
=
"collective"
,
help
=
"run mode of job, can be:collective/ps/ps-heter"
)
base_group
.
add_argument
(
"--ascend_npus"
,
type
=
str
,
default
=
None
,
help
=
"It's for ascend npu training."
"For example:"
"--ascend_npus=
\"
0,1,2,3
\"
will launch four training processes each bound to one npu."
)
base_group
.
add_argument
(
"--selected_gpus"
,
dest
=
"gpus"
)
base_group
.
add_argument
(
...
...
@@ -233,7 +224,6 @@ def launch_collective(args):
cluster
,
pod
=
ascend_utils
.
get_cloud_cluster
(
rank_table_file
=
os
.
getenv
(
"RANK_TABLE_FILE"
,
None
),
device_mode
=
device_mode
,
devices_per_proc
=
devices_per_proc
,
start_port
=
start_port
)
else
:
# trainers_num = 1 or not use paddlecloud ips="a,b"
...
...
python/paddle/distributed/fleet/launch_utils.py
浏览文件 @
387c1db4
...
...
@@ -476,6 +476,10 @@ def start_local_trainers(cluster,
if
len
(
t
.
accelerators
)
>
0
and
pod
.
device_mode
==
DeviceMode
.
GPU
:
proc_env
[
"FLAGS_selected_gpus"
]
=
"%s"
%
","
.
join
(
[
str
(
g
)
for
g
in
t
.
accelerators
])
if
len
(
t
.
accelerators
)
>
0
and
pod
.
device_mode
==
DeviceMode
.
ASCEND_NPU
:
proc_env
[
"FLAGS_selected_npus"
]
=
"%s"
%
","
.
join
(
[
str
(
g
)
for
g
in
t
.
accelerators
])
if
len
(
t
.
accelerators
)
>
0
:
proc_env
[
"FLAGS_selected_accelerators"
]
=
"%s"
%
","
.
join
(
...
...
@@ -578,16 +582,6 @@ def watch_local_trainers(procs, nranks):
return
alive
def get_ascend_npus(npus):
    """
    Build the list of Ascend NPU ids to launch trainers on.

    Args:
        npus: comma separated device ids such as "0,1,2,3", or None to use
            every device reported by ``fluid.core.NPUDevice``.

    Returns:
        A list of device id strings, or None when ``npus`` is None and the
        reported device count is not positive.
    """
    if npus is None:
        count = fluid.core.NPUDevice.get_device_count()
        if count <= 0:
            return None
        return [str(x) for x in range(count)]

    # Drop blank entries so that input such as "0,1," or "0,,1" does not
    # produce an empty-string device id.
    stripped = (x.strip() for x in npus.split(','))
    return [x for x in stripped if x]
def
get_gpus
(
gpus
):
if
gpus
is
None
:
gpus_num
=
fluid
.
core
.
get_cuda_device_count
()
...
...
@@ -650,9 +644,7 @@ def get_device_proc_info(args):
else
:
devices_per_proc
=
gpus
elif
device_mode
==
DeviceMode
.
ASCEND_NPU
:
npus
=
get_ascend_npus
(
args
.
ascend_npus
)
assert
args
.
nproc_per_node
is
None
,
"ascend_npus need't nproc_per_node arguments"
devices_per_proc
=
npus
devices_per_proc
=
None
elif
device_mode
==
DeviceMode
.
CPU
:
if
args
.
nproc_per_node
is
None
:
devices_per_proc
=
[
0
]
...
...
python/paddle/fluid/tests/unittests/ascend_group.py
浏览文件 @
387c1db4
...
...
@@ -69,6 +69,24 @@ def init_communicator(startup_program, main_program, current_endpoint, endpoints
OP_ROLE_KEY
:
OpRole
.
Forward
,
})
# add input op for test
fill_var_name
=
"tensor@Filled"
fill_var
=
block
.
create_var
(
name
=
fill_var_name
,
shape
=
[
10
,
10
],
dtype
=
'float32'
,
persistable
=
False
,
stop_gradient
=
True
)
block
.
append_op
(
type
=
"fill_constant"
,
outputs
=
{
"Out"
:
fill_var_name
},
attrs
=
{
"shape"
:
[
10
,
10
],
"dtype"
:
fill_var
.
dtype
,
"value"
:
1.0
,
"place_type"
:
1
})
with
fluid
.
program_guard
(
main_program
):
op_type
=
"c_allreduce_sum"
data
=
fluid
.
layers
.
fill_constant
(
shape
=
[
1
],
dtype
=
'float32'
,
value
=
2.5
)
...
...
@@ -117,10 +135,11 @@ def train(world_endpoints, world_device_ids, local_device_ids,local_rank):
main_program
=
main_programs
[
local_rank
]
loss
=
Loss
(
Block
(
main_program
))
optimizer
=
ascend_optimizer
.
AscendOptimizer
(
None
,
fetch_list
=
[])
optimizer
.
minimize
(
loss
,
startup_program
,
auto_dp
=
True
)
optimizer
.
minimize
(
loss
,
startup_program
,
auto_dp
=
True
,
rank_table_file
=
os
.
getenv
(
"RANK_TABLE_FILE"
,
None
))
exe
=
paddle
.
static
.
Executor
(
paddle
.
CPUPlace
())
#
exe.run(startup_program)
exe
.
run
(
startup_program
)
exe
.
run
(
main_program
)
...
...
python/paddle/fluid/tests/unittests/hccl_tools.py
0 → 100644
浏览文件 @
387c1db4
# -*- coding:UTF-8 -*-
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""generate hccl config file script"""
import
os
import
sys
import
json
import
socket
from
argparse
import
ArgumentParser
from
typing
import
Dict
,
Any
def parse_args(argv=None):
    """
    Parse the command line options of the hccl config file generator.

    Args:
        argv: optional list of argument strings. When None (the default)
            argparse reads ``sys.argv[1:]``, which matches the previous
            zero-argument behaviour.

    Returns:
        The argparse namespace with the string attributes ``device_num``,
        ``visible_devices`` and ``server_ip``.

    Examples:
        >>> parse_args([]).device_num
        '[0,8)'
    """
    parser = ArgumentParser(
        description="mindspore distributed training launch "
        "helper utilty that will generate hccl"
        " config file")
    # Adjacent string literals are concatenated verbatim, so every fragment
    # that continues a sentence must end with an explicit space.
    parser.add_argument(
        "--device_num",
        type=str,
        default="[0,8)",
        help="The number of the Ascend accelerators used. please note that the Ascend accelerators "
        "used must be continuous, such [0,4) means to use four chips "
        "0,1,2,3; [0,1) means to use chip 0; The first four chips are "
        "a group, and the last four chips are a group. In addition to "
        "the [0,8) chips are allowed, other cross-group such as [3,6) "
        "are prohibited.")
    parser.add_argument(
        "--visible_devices",
        type=str,
        default="0,1,2,3,4,5,6,7",
        help="will use the visible devices sequentially")
    parser.add_argument("--server_ip", type=str, default="", help="server ip")
    return parser.parse_args(argv)
def get_host_ip():
    """
    Look up the IP address of the local host name.

    Returns:
        The string returned by ``socket.gethostbyname`` for
        ``socket.gethostname()``, or None when the lookup fails.
    """
    try:
        return socket.gethostbyname(socket.gethostname())
    except (OSError, EOFError):
        # Resolution failures surface as socket.gaierror / socket.herror,
        # both subclasses of OSError. EOFError is kept only because the
        # previous implementation caught it.
        return None
def _parse_device_range(device_num):
    """Return (first, last) parsed from a half-open range string like "[0,8)"."""
    import re

    # Parse by pattern rather than by fixed character position, so that
    # multi-digit bounds and optional spaces ("[0, 4)") are read correctly
    # instead of being silently truncated or mis-indexed.
    matched = re.fullmatch(r"\s*[\[(]\s*(\d+)\s*,\s*(\d+)\s*[)\]]\s*",
                           device_num)
    if matched is None:
        raise ValueError(
            "device num {} must look like [first,last), e.g. [0,8) !".format(
                device_num))
    return int(matched.group(1)), int(matched.group(2))


def main():
    """
    Generate an hccl rank table json file for a single server.

    Reads the command line via ``parse_args``, maps the requested device
    range onto ``--visible_devices``, looks up each device ip in
    ``/etc/hccn.conf`` and writes ``hccl_<n>p_<ids>_<server>.json`` into the
    current working directory.

    Raises:
        ValueError: no server ip is available, ``--device_num`` is malformed
            or out of range, or it selects more devices than are visible.
        KeyError: a selected device has no ``address_<id>`` entry in
            ``/etc/hccn.conf``.
    """
    print("start", __file__)
    args = parse_args()

    # visible_devices
    visible_devices = [dev.strip() for dev in args.visible_devices.split(',')]
    print('visible_devices:{}'.format(visible_devices))

    # server_id: an explicit --server_ip wins over the resolved host ip
    ip = get_host_ip()
    if args.server_ip:
        server_id = args.server_ip
    elif ip:
        server_id = ip
    else:
        raise ValueError("please input server ip!")
    print('server_id:{}'.format(server_id))

    # device_num
    first_num, last_num = _parse_device_range(args.device_num)
    if first_num < 0 or last_num > 8:
        raise ValueError("device num {} must be in range [0,8] !".format(
            args.device_num))
    if first_num > last_num:
        raise ValueError(
            "First num {} of device num {} must less than last num {} !".
            format(first_num, args.device_num, last_num))
    # A range may not straddle the two 4-chip groups unless it is all 8.
    crosses_groups = first_num < 4 < last_num
    if crosses_groups and not (first_num == 0 and last_num == 8):
        raise ValueError(
            "device num {} must be in the same group of [0,4] or [4,8] !".
            format(args.device_num))

    device_num_list = list(range(first_num, last_num))
    print("device_num_list:", device_num_list)

    # visible_devices is indexed by the absolute ids of device_num_list, so
    # the largest id must be a valid index. An explicit raise (rather than an
    # assert) keeps the check alive under ``python -O``.
    if device_num_list and device_num_list[-1] >= len(visible_devices):
        raise ValueError(
            "device num {} needs at least {} visible devices, got {} !".format(
                args.device_num, device_num_list[-1] + 1,
                len(visible_devices)))

    # construct hccn_table
    device_ips: Dict[Any, Any] = {}
    with open('/etc/hccn.conf', 'r') as fin:
        for hccn_item in fin:
            if not hccn_item.strip().startswith('address_'):
                continue
            # Split on the first '=' only, so a value containing '=' does
            # not break the two-way unpacking.
            key, device_ip = hccn_item.split('=', 1)
            device_ips[key.split('_')[1].strip()] = device_ip.strip()

    hccn_table = {'version': '1.0', 'server_count': '1', 'server_list': []}
    device_list = []
    for rank_id, instance_id in enumerate(device_num_list):
        device_id = visible_devices[instance_id]
        if device_id not in device_ips:
            raise KeyError(
                "device {} has no address_{} entry in /etc/hccn.conf".format(
                    device_id, device_id))
        device_ip = device_ips[device_id]
        print('rank_id:{}, device_id:{}, device_ip:{}'.format(
            rank_id, device_id, device_ip))
        device_list.append({
            'device_id': device_id,
            'device_ip': device_ip,
            'rank_id': str(rank_id)
        })
    hccn_table['server_list'].append({
        'server_id': server_id,
        'device': device_list,
        'host_nic_ip': 'reserve'
    })
    hccn_table['status'] = 'completed'

    # save hccn_table to file
    table_fn = os.path.join(
        os.getcwd(), 'hccl_{}p_{}_{}.json'.format(
            len(device_num_list), "".join(map(str, device_num_list)),
            server_id))
    with open(table_fn, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)
    sys.stdout.flush()
    print("Completed: hccl file was save in :", table_fn)


if __name__ == "__main__":
    main()
python/paddle/fluid/tests/unittests/test_ascend_group.sh
浏览文件 @
387c1db4
...
...
@@ -16,15 +16,14 @@
set
-e
cluster_node_ips
=
"127.0.0.1"
export
PADDLE_TRAINERS_NUM
=
4
export
POD_IP
=
127.0.0.1
export
PADDLE_TRAINERS
=
127.0.0.1
export
PADDLE_TRAINER_ID
=
0
curr_host_ip
=
`
hostname
-i
`
python hccl_tools.py
--device_num
"[0,4)"
--server_ip
${
curr_host_ip
}
export
PADDLE_PORT
=
35789
export
TRAINER_PORTS_NUM
=
4
export
RANK_TABLE_FILE
=
"
${
PWD
}
/hccl_4p_0123_
${
curr_host_ip
}
.json"
distributed_args
=
"--ips=
${
cluster_node_ips
}
--ascend_npus=0,1,2,3 --log_dir=testlog"
# use ascend
echo
"begin test use ascend npu"
distributed_args
=
"--run_mode=collective --log_dir=testlog"
python
-m
paddle.distributed.fleet.launch
${
distributed_args
}
\
ascend_group.py fleetascendgroup
\ No newline at end of file
python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh
浏览文件 @
387c1db4
...
...
@@ -16,22 +16,43 @@
set
-e
# use paddlecloud
echo
"begin test use paddlecloud"
cluster_node_ips
=
"127.0.0.1,127.0.0.2"
export
PADDLE_TRAINERS_NUM
=
2
export
POD_IP
=
127.0.0.1
export
PADDLE_TRAINERS
=
127.0.0.1,127.0.0.2
export
PADDLE_TRAINER_ID
=
0
export
PADDLE_PORT
=
35789
export
TRAINER_PORTS_NUM
=
2
distributed_args
=
"--ips=
${
cluster_node_ips
}
--ascend_npus=0,1 --log_dir=testlog"
RANK_TABLE_FILE_NAME
=
"rank_table_file.json"
cat
>
${
RANK_TABLE_FILE_NAME
}
<<
EOF
{
"status": "completed",
"version": "1.0",
"server_count": "1",
"server_list": [
{
"server_id": "127.0.0.1",
"device": [
{
"device_id": "0",
"device_ip": "192.1.184.23",
"rank_id": "0"
},
{
"device_id": "1",
"device_ip": "192.2.21.93",
"rank_id": "1"
}
]
}
]
}
EOF
# set ascend rank table file env
export
RANK_TABLE_FILE
=
"
${
PWD
}
/
${
RANK_TABLE_FILE_NAME
}
"
# use ascend
echo
"begin test use ascend npu"
distributed_args
=
"--run_mode=collective --log_dir=testlog"
python
-m
paddle.distributed.fleet.launch
${
distributed_args
}
ascend_multi_process_collective.py fleetlaunchascend
str1
=
"selected_accelerators:0 worker_endpoints:127.0.0.1:
35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,
0,1 device_id:0"
str2
=
"selected_accelerators:1 worker_endpoints:127.0.0.1:
35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,
0,1 device_id:1"
str1
=
"selected_accelerators:0 worker_endpoints:127.0.0.1:
6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6170 trainer_id:0 device_ids:
0,1 device_id:0"
str2
=
"selected_accelerators:1 worker_endpoints:127.0.0.1:
6170,127.0.0.1:6171 trainers_num:2 current_endpoint:127.0.0.1:6171 trainer_id:1 device_ids:
0,1 device_id:1"
file_0
=
"multi_process_fleetlaunchascend.check_0.log"
file_1
=
"multi_process_fleetlaunchascend.check_1.log"
...
...
python/paddle/fluid/tests/unittests/test_fleet_launch_ascend2.sh
已删除
100644 → 0
浏览文件 @
ff4654e2
#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e

# Write a two-server rank table (two devices per server) for the launcher.
RANK_TABLE_FILE_NAME="rank_table_file.json"
cat > "${RANK_TABLE_FILE_NAME}" <<EOF
{
    "status": "completed",
    "version": "1.0",
    "server_count": "2",
    "server_list": [
        {
            "server_id": "127.0.0.1",
            "device": [
                {
                    "device_id": "0",
                    "device_ip": "192.1.184.23",
                    "rank_id": "0"
                },
                {
                    "device_id": "1",
                    "device_ip": "192.2.21.93",
                    "rank_id": "1"
                }
            ]
        },
        {
            "server_id": "127.0.0.2",
            "device": [
                {
                    "device_id": "0",
                    "device_ip": "192.1.94.132",
                    "rank_id": "2"
                },
                {
                    "device_id": "1",
                    "device_ip": "192.2.94.30",
                    "rank_id": "3"
                }
            ]
        }
    ]
}
EOF

# set ascend rank table file env
export RANK_TABLE_FILE="${PWD}/${RANK_TABLE_FILE_NAME}"

# use paddlecloud
echo "begin test use paddlecloud"
cluster_node_ips="127.0.0.1,127.0.0.2"
export PADDLE_TRAINERS_NUM=2
export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0

export PADDLE_PORT=35789
export TRAINER_PORTS_NUM=2

# distributed_args is expanded unquoted on purpose: it holds several options
# that must be split into separate words.
distributed_args="--run_mode=collective --log_dir=testlog"
python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend

str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0"
str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:1"
file_0="multi_process_fleetlaunchascend.check_0.log"
file_1="multi_process_fleetlaunchascend.check_1.log"

echo "paddlecloud params test"
# -F matches the expected line literally; without it every '.' in the
# addresses would act as a regex wildcard.
if grep -qF "$str1" "$file_0"; then
    echo "find trainer 0"
else
    echo "not find trainer 0"
    # exit codes are limited to 0-255; -1 is out of range
    exit 1
fi

if grep -qF "$str2" "$file_1"; then
    echo "find trainer 1"
else
    echo "not find trainer 1"
    exit 1
fi

# test async poll process
rm -f "$file_0" "$file_1"
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录