Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
48e9d60a
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
48e9d60a
编写于
7月 06, 2020
作者:
M
mindspore-ci-bot
提交者:
Gitee
7月 06, 2020
浏览文件
操作
浏览文件
下载
差异文件
!2808 add_hccl_tools
Merge pull request !2808 from Guomenghao319/add_hccl_tools
上级
f92c4a53
ab90f30a
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
179 addition
and
0 deletion
+179
-0
model_zoo/utils/hccl_tools/README.md
model_zoo/utils/hccl_tools/README.md
+14
-0
model_zoo/utils/hccl_tools/hccl_tools.py
model_zoo/utils/hccl_tools/hccl_tools.py
+165
-0
未找到文件。
model_zoo/utils/hccl_tools/README.md
0 → 100644
浏览文件 @
48e9d60a
# description
MindSpore distributed training launch helper utility that generates the hccl config file.
# use
```
python hccl_tools.py --device_num "[0,8]"
```
output:
```
hccl_[device_num]p_[which device]_[server_ip].json
```
\ No newline at end of file
model_zoo/utils/hccl_tools/hccl_tools.py
0 → 100644
浏览文件 @
48e9d60a
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""generate hccl config file script"""
import
os
import
sys
import
json
import
socket
import
platform
from
argparse
import
ArgumentParser
from
typing
import
Dict
,
Any
def parse_args(argv=None):
    """
    Parse the command-line options of the hccl config generator.

    Args:
        argv (list[str], optional): Argument strings to parse. Default: None,
            in which case argparse reads them from ``sys.argv[1:]``.

    Returns:
        argparse.Namespace, carrying the string attributes ``device_num``,
        ``visible_devices`` and ``server_ip``.

    Examples:
        >>> parse_args(["--device_num", "[0,4]"]).device_num
        '[0,4]'
    """
    parser = ArgumentParser(description="mindspore distributed training launch "
                                        "helper utility that will generate hccl"
                                        " config file")
    # Adjacent string literals are concatenated verbatim, so every fragment
    # except the last one has to end with an explicit space.
    parser.add_argument("--device_num", type=str, default="[0,8]",
                        help="The number of the D chip used. please note that the D chips "
                             "used must be continuous, such as [0,4] means to use four chips "
                             "0,1,2,3; [0,1] means to use chip 0; The first four chips are "
                             "a group, and the last four chips are a group. In addition to "
                             "the [0,8] chips are allowed, other cross-group such as [3,6] "
                             "are prohibited.")
    parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7",
                        help="will use the visible devices sequentially")
    parser.add_argument("--server_ip", type=str, default="",
                        help="server ip")
    return parser.parse_args(argv)
def get_host_ip():
    """
    Look up the IPv4 address that the local host name resolves to.

    Returns:
        str, the address reported by ``socket.gethostbyname`` for the local
        host name, or None when the name cannot be resolved.
    """
    try:
        return socket.gethostbyname(socket.gethostname())
    except OSError:
        # Resolution failures surface as socket.gaierror / socket.herror, both
        # OSError subclasses; report "unknown" so the caller can ask for an
        # explicit server ip instead of crashing here.
        return None
def _parse_device_range(device_num):
    """Convert a "[first,last]" string into a validated (first, last) int pair."""
    # Tolerate surrounding brackets/parentheses and inner spaces, e.g. "[0, 8]".
    bounds = device_num.strip().strip('[]()').split(',')
    try:
        first_num, last_num = [int(bound) for bound in bounds]
    except ValueError as err:
        raise ValueError("device num {} must look like [first,last], such as [0,8] !"
                         .format(device_num)) from err

    if first_num < 0 or last_num > 8:
        raise ValueError("device num {} must be in range [0,8] !".format(device_num))
    # An empty range would produce a config without any device, so equality is
    # rejected together with a reversed range.
    if first_num >= last_num:
        raise ValueError("First num {} of device num {} must less than last num {} !"
                         .format(first_num, device_num, last_num))
    # Chips 0-3 and 4-7 form two groups; only the full [0,8] range may span both.
    if first_num < 4 < last_num and (first_num, last_num) != (0, 8):
        raise ValueError("device num {} must be in the same group of [0,4] or [4,8] !"
                         .format(device_num))
    return first_num, last_num


def _read_device_ips(conf_path):
    """Collect the ``address_<id>=<ip>`` entries of an hccn.conf file as {id: ip}."""
    device_ips: Dict[Any, Any] = {}
    with open(conf_path, 'r') as fin:
        for hccn_item in fin:
            key, sep, device_ip = hccn_item.strip().partition('=')
            if not sep or not key.startswith('address_'):
                continue
            device_ips[key.split('_', 1)[1].strip()] = device_ip.strip()
    return device_ips


def _detect_board_id():
    """Map the CPU architecture of this host to the board id written to the table."""
    board_ids = {'aarch64': '0x002f', 'x86_64': '0x0000'}
    arch = platform.processor()
    if arch not in board_ids:
        # platform.processor() may be empty or vendor specific; platform.machine()
        # reports the plain architecture name.
        arch = platform.machine()
    if arch not in board_ids:
        raise KeyError("unsupported architecture {!r}, expected one of {}"
                       .format(arch, sorted(board_ids)))
    return board_ids[arch]


def main():
    """
    Generate the hccl rank table for this server and write it as a json file.

    Reads the command-line options through ``parse_args``, takes the device
    ips from ``/etc/hccn.conf`` and saves the table in the current working
    directory as ``hccl_<n>p_<device indexes>_<server id>.json``.

    Raises:
        ValueError: If no server ip is given and none can be resolved, if
            ``--device_num`` is malformed, out of range, empty or crosses the
            two chip groups, or if ``--visible_devices`` has too few entries.
        KeyError: If a selected device has no ``address_<id>`` entry in
            ``/etc/hccn.conf`` or the host architecture is not supported.
    """
    print("start", __file__)
    args = parse_args()

    # visible_devices
    visible_devices = [device.strip() for device in args.visible_devices.split(',')]
    print('visible_devices:{}'.format(visible_devices))

    # server_id: an explicit --server_ip wins, so the host lookup only runs
    # when it is actually needed.
    server_id = args.server_ip or get_host_ip()
    if not server_id:
        raise ValueError("please input server ip!")
    print('server_id:{}'.format(server_id))

    # device_num
    first_num, last_num = _parse_device_range(args.device_num)
    device_num_list = list(range(first_num, last_num))
    print("device_num_list:", device_num_list)
    # visible_devices is indexed by absolute chip position, so it has to reach
    # up to last_num, not merely hold as many entries as the range is long.
    if len(visible_devices) < last_num:
        raise ValueError("visible devices {} must contain at least {} entries to cover device num {} !"
                         .format(args.visible_devices, last_num, args.device_num))
    device_ids = [visible_devices[index] for index in device_num_list]

    # construct hccn_table
    device_ips = _read_device_ips('/etc/hccn.conf')
    missing_ids = [device_id for device_id in device_ids if device_id not in device_ips]
    if missing_ids:
        raise KeyError("no address_<id> entry in /etc/hccn.conf for device(s) {}"
                       .format(missing_ids))

    hccn_table = {'board_id': _detect_board_id(),
                  'chip_info': '910',
                  'deploy_mode': 'lab',
                  'group_count': '1',
                  'group_list': []}

    instance_list = []
    for rank_id, device_id in enumerate(device_ids):
        device_ip = device_ips[device_id]
        print('rank_id:{}, device_id:{}, device_ip:{}'.format(rank_id, device_id, device_ip))
        instance_list.append({
            'devices': [{'device_id': device_id, 'device_ip': device_ip}],
            'rank_id': str(rank_id),
            'server_id': server_id,
        })

    device_count = str(len(device_num_list))
    hccn_table['group_list'].append({
        'device_num': device_count,
        'server_num': '1',
        'group_name': '',
        'instance_count': device_count,
        'instance_list': instance_list,
    })
    hccn_table['para_plane_nic_location'] = 'device'
    hccn_table['para_plane_nic_name'] = ['eth{}'.format(device_id) for device_id in device_ids]
    hccn_table['para_plane_nic_num'] = device_count
    hccn_table['status'] = 'completed'

    # save hccn_table to file
    table_fn = os.path.join(os.getcwd(),
                            'hccl_{}p_{}_{}.json'.format(len(device_num_list),
                                                         "".join(map(str, device_num_list)),
                                                         server_id))
    with open(table_fn, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)
    sys.stdout.flush()
    print("Completed: hccl file was save in :", table_fn)
# Generate the config only when the file is run as a script; importing the
# module must stay free of side effects.
if __name__ == "__main__":
    main()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录