Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
74fcbd29
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
74fcbd29
编写于
7月 08, 2020
作者:
W
wandongdong
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add hccl_config
上级
fdc3a235
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
8 addition
and
73 deletion
+8
-73
model_zoo/mobilenetv2/Readme.md
model_zoo/mobilenetv2/Readme.md
+2
-2
model_zoo/mobilenetv2/scripts/run_train.sh
model_zoo/mobilenetv2/scripts/run_train.sh
+6
-5
model_zoo/mobilenetv2/src/launch.py
model_zoo/mobilenetv2/src/launch.py
+0
-66
未找到文件。
model_zoo/mobilenetv2/Readme.md
浏览文件 @
74fcbd29
...
...
@@ -60,14 +60,14 @@ Dataset used: [imagenet](http://www.image-net.org/)
### Usage
-
Ascend: sh run_train.sh Ascend [DEVICE_NUM] [
SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)
] [DATASET_PATH] [CKPT_PATH]
-
Ascend: sh run_train.sh Ascend [DEVICE_NUM] [
VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH
] [DATASET_PATH] [CKPT_PATH]
-
GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]
### Launch
```
# training example
Ascend: sh run_train.sh Ascend 8
192.168.0.1 0,1,2,3,4,5,6,7
~/imagenet/train/ mobilenet_199.ckpt
Ascend: sh run_train.sh Ascend 8
0,1,2,3,4,5,6,7 hccl_config.json
~/imagenet/train/ mobilenet_199.ckpt
GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 ~/imagenet/train/
```
...
...
model_zoo/mobilenetv2/scripts/run_train.sh
浏览文件 @
74fcbd29
...
...
@@ -22,14 +22,16 @@ run_ascend()
exit
1
fi
if
[
!
-d
$5
]
if
[
!
-d
$5
]
&&
[
!
-f
$5
]
then
echo
"error: DATASET_PATH=
$5
is not a directory"
echo
"error: DATASET_PATH=
$5
is not a directory
or file
"
exit
1
fi
BASEPATH
=
$(
cd
"
`
dirname
$0
`
"
||
exit
;
pwd
)
export
PYTHONPATH
=
${
BASEPATH
}
:
$PYTHONPATH
export
MINDSPORE_HCCL_CONFIG_PATH
=
$4
export
RANK_TABLE_FILE
=
$4
if
[
-d
"../train"
]
;
then
rm
-rf
../train
...
...
@@ -38,8 +40,7 @@ run_ascend()
cd
../train
||
exit
python
${
BASEPATH
}
/../src/launch.py
\
--nproc_per_node
=
$2
\
--visible_devices
=
$4
\
--server_id
=
$3
\
--visible_devices
=
$3
\
--training_script
=
${
BASEPATH
}
/../train.py
\
--dataset_path
=
$5
\
--pre_trained
=
$6
\
...
...
@@ -80,7 +81,7 @@ run_gpu()
if
[
$#
-gt
6
]
||
[
$#
-lt
4
]
then
echo
"Usage:
\n
\
Ascend: sh run_train.sh Ascend [DEVICE_NUM] [
SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)
] [DATASET_PATH] [CKPT_PATH]
\n
\
Ascend: sh run_train.sh Ascend [DEVICE_NUM] [
VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH
] [DATASET_PATH] [CKPT_PATH]
\n
\
GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]
\n
\
"
exit
1
...
...
model_zoo/mobilenetv2/src/launch.py
浏览文件 @
74fcbd29
...
...
@@ -15,7 +15,6 @@
"""launch train script"""
import
os
import
sys
import
json
import
subprocess
import
shutil
from
argparse
import
ArgumentParser
...
...
@@ -42,8 +41,6 @@ def parse_args():
"each process can be bound to a single D."
)
parser
.
add_argument
(
"--visible_devices"
,
type
=
str
,
default
=
"0,1,2,3,4,5,6,7"
,
help
=
"will use the visible devices sequentially"
)
parser
.
add_argument
(
"--server_id"
,
type
=
str
,
default
=
""
,
help
=
"server ip"
)
parser
.
add_argument
(
"--training_script"
,
type
=
str
,
help
=
"The full path to the single D training "
"program/script to be launched in parallel, "
...
...
@@ -63,66 +60,6 @@ def main():
assert
os
.
path
.
isfile
(
args
.
training_script
)
assert
len
(
visible_devices
)
>=
args
.
nproc_per_node
print
(
'visible_devices:{}'
.
format
(
visible_devices
))
if
not
args
.
server_id
:
print
(
'pleaser input server ip!!!'
)
exit
(
0
)
print
(
'server_id:{}'
.
format
(
args
.
server_id
))
# construct hccn_table
hccn_configs
=
open
(
'/etc/hccn.conf'
,
'r'
).
readlines
()
device_ips
=
{}
for
hccn_item
in
hccn_configs
:
hccn_item
=
hccn_item
.
strip
()
if
hccn_item
.
startswith
(
'address_'
):
device_id
,
device_ip
=
hccn_item
.
split
(
'='
)
device_id
=
device_id
.
split
(
'_'
)[
1
]
device_ips
[
device_id
]
=
device_ip
print
(
'device_id:{}, device_ip:{}'
.
format
(
device_id
,
device_ip
))
hccn_table
=
{}
hccn_table
[
'board_id'
]
=
'0x0000'
hccn_table
[
'chip_info'
]
=
'910'
hccn_table
[
'deploy_mode'
]
=
'lab'
hccn_table
[
'group_count'
]
=
'1'
hccn_table
[
'group_list'
]
=
[]
instance_list
=
[]
usable_dev
=
''
for
instance_id
in
range
(
args
.
nproc_per_node
):
instance
=
{}
instance
[
'devices'
]
=
[]
device_id
=
visible_devices
[
instance_id
]
device_ip
=
device_ips
[
device_id
]
usable_dev
+=
str
(
device_id
)
instance
[
'devices'
].
append
({
'device_id'
:
device_id
,
'device_ip'
:
device_ip
,
})
instance
[
'rank_id'
]
=
str
(
instance_id
)
instance
[
'server_id'
]
=
args
.
server_id
instance_list
.
append
(
instance
)
hccn_table
[
'group_list'
].
append
({
'device_num'
:
str
(
args
.
nproc_per_node
),
'server_num'
:
'1'
,
'group_name'
:
''
,
'instance_count'
:
str
(
args
.
nproc_per_node
),
'instance_list'
:
instance_list
,
})
hccn_table
[
'para_plane_nic_location'
]
=
'device'
hccn_table
[
'para_plane_nic_name'
]
=
[]
for
instance_id
in
range
(
args
.
nproc_per_node
):
eth_id
=
visible_devices
[
instance_id
]
hccn_table
[
'para_plane_nic_name'
].
append
(
'eth{}'
.
format
(
eth_id
))
hccn_table
[
'para_plane_nic_num'
]
=
str
(
args
.
nproc_per_node
)
hccn_table
[
'status'
]
=
'completed'
# save hccn_table to file
table_path
=
os
.
getcwd
()
if
not
os
.
path
.
exists
(
table_path
):
os
.
mkdir
(
table_path
)
table_fn
=
os
.
path
.
join
(
table_path
,
'rank_table_{}p_{}_{}.json'
.
format
(
args
.
nproc_per_node
,
usable_dev
,
args
.
server_id
))
with
open
(
table_fn
,
'w'
)
as
table_fp
:
json
.
dump
(
hccn_table
,
table_fp
,
indent
=
4
)
sys
.
stdout
.
flush
()
# spawn the processes
processes
=
[]
...
...
@@ -137,9 +74,6 @@ def main():
device_dir
=
os
.
path
.
join
(
cur_path
,
'device{}'
.
format
(
rank_id
))
env
[
'RANK_ID'
]
=
str
(
rank_id
)
env
[
'DEVICE_ID'
]
=
str
(
device_id
)
if
args
.
nproc_per_node
>
1
:
env
[
'MINDSPORE_HCCL_CONFIG_PATH'
]
=
table_fn
env
[
'RANK_TABLE_FILE'
]
=
table_fn
if
os
.
path
.
exists
(
device_dir
):
shutil
.
rmtree
(
device_dir
)
os
.
mkdir
(
device_dir
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录