Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
2301_77200941
mindspore
提交
f2d3fd34
M
mindspore
项目概览
2301_77200941
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
f2d3fd34
编写于
9月 07, 2020
作者:
L
lichenever
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
rectification_allreduce_fusion_api
上级
1519b881
变更
11
隐藏空白更改
内联
并排
Showing
11 changed files
with
32 additions
and
38 deletions
+32
-38
mindspore/context.py
mindspore/context.py
+4
-2
mindspore/parallel/_auto_parallel_context.py
mindspore/parallel/_auto_parallel_context.py
+7
-3
model_zoo/official/cv/mobilenetv2/src/utils.py
model_zoo/official/cv/mobilenetv2/src/utils.py
+2
-2
model_zoo/official/cv/resnet/train.py
model_zoo/official/cv/resnet/train.py
+3
-4
model_zoo/official/cv/resnet50_quant/train.py
model_zoo/official/cv/resnet50_quant/train.py
+1
-3
model_zoo/official/cv/resnet_thor/train.py
model_zoo/official/cv/resnet_thor/train.py
+2
-5
model_zoo/official/nlp/bert/run_pretrain.py
model_zoo/official/nlp/bert/run_pretrain.py
+4
-5
tests/st/networks/models/resnet50/test_resnet50_imagenet.py
tests/st/networks/models/resnet50/test_resnet50_imagenet.py
+4
-5
tests/st/tbe_networks/resnet_cifar.py
tests/st/tbe_networks/resnet_cifar.py
+2
-3
tests/st/tbe_networks/test_resnet_cifar_8p.py
tests/st/tbe_networks/test_resnet_cifar_8p.py
+1
-3
tests/ut/python/parallel/test_parallel_optimizer.py
tests/ut/python/parallel/test_parallel_optimizer.py
+2
-3
未找到文件。
mindspore/context.py
浏览文件 @
f2d3fd34
...
...
@@ -325,7 +325,8 @@ def _context():
@
args_type_check
(
device_num
=
int
,
global_rank
=
int
,
gradients_mean
=
bool
,
gradient_fp32_sync
=
bool
,
parallel_mode
=
str
,
auto_parallel_search_mode
=
str
,
parameter_broadcast
=
bool
,
strategy_ckpt_load_file
=
str
,
strategy_ckpt_save_file
=
str
,
full_batch
=
bool
,
enable_parallel_optimizer
=
bool
)
strategy_ckpt_save_file
=
str
,
full_batch
=
bool
,
enable_parallel_optimizer
=
bool
,
all_reduce_fusion_config
=
list
)
def
set_auto_parallel_context
(
**
kwargs
):
"""
Set auto parallel context.
...
...
@@ -371,8 +372,9 @@ def set_auto_parallel_context(**kwargs):
strategy_ckpt_load_file (str): The path to load parallel strategy checkpoint. Default: ''
strategy_ckpt_save_file (str): The path to save parallel strategy checkpoint. Default: ''
full_batch (bool): Whether to load the whole batch on each device. Default: False.
enable_parallel_optimizer(bool): This is a developing feature, which shards the weight update computation in
enable_parallel_optimizer
(bool): This is a developing feature, which shards the weight update computation in
data parallel training in the benefit of time and memory saving.
all_reduce_fusion_config (list): Set allreduce fusion strategy by parameters indices.
Raises:
ValueError: If input key is not attribute in auto parallel context.
...
...
mindspore/parallel/_auto_parallel_context.py
浏览文件 @
f2d3fd34
...
...
@@ -462,7 +462,8 @@ _set_auto_parallel_context_func_map = {
"strategy_ckpt_load_file"
:
auto_parallel_context
().
set_strategy_ckpt_load_file
,
"strategy_ckpt_save_file"
:
auto_parallel_context
().
set_strategy_ckpt_save_file
,
"full_batch"
:
auto_parallel_context
().
set_full_batch
,
"enable_parallel_optimizer"
:
auto_parallel_context
().
set_enable_parallel_optimizer
}
"enable_parallel_optimizer"
:
auto_parallel_context
().
set_enable_parallel_optimizer
,
"all_reduce_fusion_config"
:
auto_parallel_context
().
set_all_reduce_fusion_split_indices
}
_get_auto_parallel_context_func_map
=
{
...
...
@@ -477,13 +478,15 @@ _get_auto_parallel_context_func_map = {
"strategy_ckpt_load_file"
:
auto_parallel_context
().
get_strategy_ckpt_load_file
,
"strategy_ckpt_save_file"
:
auto_parallel_context
().
get_strategy_ckpt_save_file
,
"full_batch"
:
auto_parallel_context
().
get_full_batch
,
"enable_parallel_optimizer"
:
auto_parallel_context
().
get_enable_parallel_optimizer
}
"enable_parallel_optimizer"
:
auto_parallel_context
().
get_enable_parallel_optimizer
,
"all_reduce_fusion_config"
:
auto_parallel_context
().
get_all_reduce_fusion_split_indices
}
@
args_type_check
(
device_num
=
int
,
global_rank
=
int
,
gradients_mean
=
bool
,
gradient_fp32_sync
=
bool
,
loss_repeated_mean
=
bool
,
parallel_mode
=
str
,
auto_parallel_search_mode
=
str
,
parameter_broadcast
=
bool
,
strategy_ckpt_load_file
=
str
,
strategy_ckpt_save_file
=
str
,
full_batch
=
bool
,
enable_parallel_optimizer
=
bool
)
strategy_ckpt_save_file
=
str
,
full_batch
=
bool
,
enable_parallel_optimizer
=
bool
,
all_reduce_fusion_config
=
list
)
def
_set_auto_parallel_context
(
**
kwargs
):
"""
...
...
@@ -526,6 +529,7 @@ def _set_auto_parallel_context(**kwargs):
strategy_ckpt_save_file (str): The path to save parallel strategy checkpoint. Default: ''
full_batch (bool): Whether to load the whole batch on each device. Default: False.
enable_parallel_optimizer (bool): Enable using optimizer segmentation or not. Default: False.
all_reduce_fusion_config (list): Set allreduce fusion strategy by parameters indices.
Raises:
ValueError: If input key is not attribute in auto parallel context.
...
...
model_zoo/official/cv/mobilenetv2/src/utils.py
浏览文件 @
f2d3fd34
...
...
@@ -47,8 +47,8 @@ def context_device_init(config):
if
config
.
run_distribute
:
context
.
set_auto_parallel_context
(
device_num
=
config
.
rank_size
,
parallel_mode
=
ParallelMode
.
DATA_PARALLEL
,
parameter_broadcast
=
True
,
gradients_mean
=
True
)
auto_parallel_context
().
set_all_reduce_fusion_split_indices
(
[
140
])
parameter_broadcast
=
True
,
gradients_mean
=
True
,
all_reduce_fusion_config
=
[
140
])
init
()
else
:
raise
ValueError
(
"Only support CPU, GPU and Ascend."
)
...
...
model_zoo/official/cv/resnet/train.py
浏览文件 @
f2d3fd34
...
...
@@ -18,7 +18,6 @@ import argparse
import
ast
from
mindspore
import
context
from
mindspore
import
Tensor
from
mindspore.parallel._auto_parallel_context
import
auto_parallel_context
from
mindspore.nn.optim.momentum
import
Momentum
from
mindspore.train.model
import
Model
from
mindspore.context
import
ParallelMode
...
...
@@ -78,9 +77,9 @@ if __name__ == '__main__':
context
.
set_auto_parallel_context
(
device_num
=
args_opt
.
device_num
,
parallel_mode
=
ParallelMode
.
DATA_PARALLEL
,
gradients_mean
=
True
)
if
args_opt
.
net
==
"resnet50"
or
args_opt
.
net
==
"se-resnet50"
:
auto_parallel_context
().
set_all_reduce_fusion_split_indices
([
85
,
16
0
])
context
.
set_auto_parallel_context
(
all_reduce_fusion_config
=
[
85
,
16
0
])
else
:
auto_parallel_context
().
set_all_reduce_fusion_split_indices
(
[
180
,
313
])
context
.
set_auto_parallel_context
(
all_reduce_fusion_config
=
[
180
,
313
])
init
()
# GPU target
else
:
...
...
@@ -88,7 +87,7 @@ if __name__ == '__main__':
context
.
set_auto_parallel_context
(
device_num
=
get_group_size
(),
parallel_mode
=
ParallelMode
.
DATA_PARALLEL
,
gradients_mean
=
True
)
if
args_opt
.
net
==
"resnet50"
:
auto_parallel_context
().
set_all_reduce_fusion_split_indices
(
[
85
,
160
])
context
.
set_auto_parallel_context
(
all_reduce_fusion_config
=
[
85
,
160
])
ckpt_save_dir
=
config
.
save_checkpoint_path
+
"ckpt_"
+
str
(
get_rank
())
+
"/"
# create dataset
...
...
model_zoo/official/cv/resnet50_quant/train.py
浏览文件 @
f2d3fd34
...
...
@@ -19,7 +19,6 @@ import argparse
from
mindspore
import
context
from
mindspore
import
Tensor
from
mindspore.parallel._auto_parallel_context
import
auto_parallel_context
from
mindspore.nn.optim.momentum
import
Momentum
from
mindspore.train.model
import
Model
from
mindspore.context
import
ParallelMode
...
...
@@ -80,8 +79,7 @@ if __name__ == '__main__':
init
()
context
.
set_auto_parallel_context
(
device_num
=
args_opt
.
device_num
,
parallel_mode
=
ParallelMode
.
DATA_PARALLEL
,
gradients_mean
=
True
)
auto_parallel_context
().
set_all_reduce_fusion_split_indices
([
107
,
160
])
gradients_mean
=
True
,
all_reduce_fusion_config
=
[
107
,
160
])
# define network
net
=
resnet50_quant
(
class_num
=
config
.
class_num
)
...
...
model_zoo/official/cv/resnet_thor/train.py
浏览文件 @
f2d3fd34
...
...
@@ -20,7 +20,6 @@ import numpy as np
from
mindspore
import
context
from
mindspore
import
Tensor
from
mindspore.common
import
set_seed
from
mindspore.parallel._auto_parallel_context
import
auto_parallel_context
from
mindspore.context
import
ParallelMode
from
mindspore.train.callback
import
ModelCheckpoint
,
CheckpointConfig
,
TimeMonitor
,
LossMonitor
from
mindspore.train.loss_scale_manager
import
FixedLossScaleManager
...
...
@@ -94,15 +93,13 @@ if __name__ == '__main__':
device_id
=
int
(
os
.
getenv
(
'DEVICE_ID'
))
context
.
set_context
(
device_id
=
device_id
,
enable_auto_mixed_precision
=
True
)
context
.
set_auto_parallel_context
(
device_num
=
args_opt
.
device_num
,
parallel_mode
=
ParallelMode
.
DATA_PARALLEL
,
gradients_mean
=
True
)
auto_parallel_context
().
set_all_reduce_fusion_split_indices
([
107
])
gradients_mean
=
True
,
all_reduce_fusion_config
=
[
107
])
init
()
# GPU target
else
:
init
()
context
.
set_auto_parallel_context
(
device_num
=
get_group_size
(),
parallel_mode
=
ParallelMode
.
DATA_PARALLEL
,
gradients_mean
=
True
)
auto_parallel_context
().
set_all_reduce_fusion_split_indices
([
107
])
gradients_mean
=
True
,
all_reduce_fusion_config
=
[
107
])
ckpt_save_dir
=
config
.
save_checkpoint_path
+
"ckpt_"
+
str
(
get_rank
())
+
"/"
# create dataset
...
...
model_zoo/official/nlp/bert/run_pretrain.py
浏览文件 @
f2d3fd34
...
...
@@ -87,17 +87,16 @@ def run_pretrain():
context
.
reset_auto_parallel_context
()
context
.
set_auto_parallel_context
(
parallel_mode
=
ParallelMode
.
DATA_PARALLEL
,
gradients_mean
=
True
,
device_num
=
device_num
)
from
mindspore.parallel._auto_parallel_context
import
auto_parallel_context
if
bert_net_cfg
.
num_hidden_layers
==
12
:
if
bert_net_cfg
.
use_relative_positions
:
auto_parallel_context
().
set_all_reduce_fusion_split_indices
(
[
29
,
58
,
87
,
116
,
145
,
174
,
203
,
217
])
context
.
set_auto_parallel_context
(
all_reduce_fusion_config
=
[
29
,
58
,
87
,
116
,
145
,
174
,
203
,
217
])
else
:
auto_parallel_context
().
set_all_reduce_fusion_split_indices
(
[
28
,
55
,
82
,
109
,
136
,
163
,
190
,
205
])
context
.
set_auto_parallel_context
(
all_reduce_fusion_config
=
[
28
,
55
,
82
,
109
,
136
,
163
,
190
,
205
])
elif
bert_net_cfg
.
num_hidden_layers
==
24
:
if
bert_net_cfg
.
use_relative_positions
:
auto_parallel_context
().
set_all_reduce_fusion_split_indices
(
[
30
,
90
,
150
,
210
,
270
,
330
,
390
,
421
])
context
.
set_auto_parallel_context
(
all_reduce_fusion_config
=
[
30
,
90
,
150
,
210
,
270
,
330
,
390
,
421
])
else
:
auto_parallel_context
().
set_all_reduce_fusion_split_indices
(
[
38
,
93
,
148
,
203
,
258
,
313
,
368
,
397
])
context
.
set_auto_parallel_context
(
all_reduce_fusion_config
=
[
38
,
93
,
148
,
203
,
258
,
313
,
368
,
397
])
else
:
rank
=
0
device_num
=
1
...
...
tests/st/networks/models/resnet50/test_resnet50_imagenet.py
浏览文件 @
f2d3fd34
...
...
@@ -23,7 +23,6 @@ import numpy as np
from
mindspore
import
context
,
Tensor
from
mindspore.communication.management
import
init
from
mindspore.parallel._auto_parallel_context
import
auto_parallel_context
from
mindspore.train.model
import
Model
from
mindspore.context
import
ParallelMode
from
mindspore.train.callback
import
Callback
...
...
@@ -137,8 +136,8 @@ def train_process(q, device_id, epoch_size, device_num, enable_hccl):
os
.
environ
[
'RANK_SIZE'
]
=
str
(
device_num
)
if
enable_hccl
:
context
.
set_auto_parallel_context
(
device_num
=
device_num
,
parallel_mode
=
ParallelMode
.
DATA_PARALLEL
,
gradients_mean
=
True
,
parameter_broadcast
=
True
)
auto_parallel_context
().
set_all_reduce_fusion_split_indices
(
[
107
,
160
])
gradients_mean
=
True
,
parameter_broadcast
=
True
,
all_reduce_fusion_config
=
[
107
,
160
])
init
()
# network
...
...
@@ -240,8 +239,8 @@ def train_process_thor(q, device_id, epoch_size, device_num, enable_hccl):
os
.
environ
[
'RANK_SIZE'
]
=
str
(
device_num
)
if
enable_hccl
:
context
.
set_auto_parallel_context
(
device_num
=
device_num
,
parallel_mode
=
ParallelMode
.
DATA_PARALLEL
,
gradients_mean
=
True
,
parameter_broadcast
=
True
)
auto_parallel_context
().
set_all_reduce_fusion_split_indices
(
[
107
])
gradients_mean
=
True
,
parameter_broadcast
=
True
,
all_reduce_fusion_config
=
[
107
])
init
()
# network
...
...
tests/st/tbe_networks/resnet_cifar.py
浏览文件 @
f2d3fd34
...
...
@@ -31,7 +31,6 @@ from mindspore import context
from
mindspore.communication.management
import
init
from
mindspore.nn.optim.momentum
import
Momentum
from
mindspore.ops
import
operations
as
P
from
mindspore.parallel._auto_parallel_context
import
auto_parallel_context
from
mindspore.train.model
import
Model
from
mindspore.context
import
ParallelMode
...
...
@@ -124,8 +123,8 @@ class CrossEntropyLoss(nn.Cell):
if
__name__
==
'__main__'
:
if
not
args_opt
.
do_eval
and
args_opt
.
run_distribute
:
context
.
set_auto_parallel_context
(
device_num
=
args_opt
.
device_num
,
parallel_mode
=
ParallelMode
.
DATA_PARALLEL
)
auto_parallel_context
().
set_all_reduce_fusion_split_indices
(
[
140
])
context
.
set_auto_parallel_context
(
device_num
=
args_opt
.
device_num
,
parallel_mode
=
ParallelMode
.
DATA_PARALLEL
,
all_reduce_fusion_config
=
[
140
])
init
()
context
.
set_context
(
mode
=
context
.
GRAPH_MODE
)
...
...
tests/st/tbe_networks/test_resnet_cifar_8p.py
浏览文件 @
f2d3fd34
...
...
@@ -30,7 +30,6 @@ from mindspore import context
from
mindspore.communication.management
import
init
from
mindspore.nn.optim.momentum
import
Momentum
from
mindspore.ops
import
operations
as
P
from
mindspore.parallel._auto_parallel_context
import
auto_parallel_context
from
mindspore.train.callback
import
Callback
from
mindspore.train.model
import
Model
from
mindspore.context
import
ParallelMode
...
...
@@ -154,8 +153,7 @@ def train_process(q, device_id, epoch_size, num_classes, device_num, batch_size,
os
.
environ
[
'RANK_SIZE'
]
=
str
(
device_num
)
if
enable_hccl
:
context
.
set_auto_parallel_context
(
device_num
=
device_num
,
parallel_mode
=
ParallelMode
.
DATA_PARALLEL
)
auto_parallel_context
().
set_all_reduce_fusion_split_indices
([
140
])
device_num
=
device_num
,
parallel_mode
=
ParallelMode
.
DATA_PARALLEL
,
all_reduce_fusion_config
=
[
140
])
init
()
context
.
set_context
(
mode
=
context
.
GRAPH_MODE
)
net
=
resnet50
(
batch_size
,
num_classes
)
...
...
tests/ut/python/parallel/test_parallel_optimizer.py
浏览文件 @
f2d3fd34
...
...
@@ -23,7 +23,6 @@ from mindspore.nn import TrainOneStepCell, WithLossCell
from
mindspore.nn.optim
import
Adam
,
AdamWeightDecay
,
Lamb
from
mindspore.ops
import
operations
as
P
from
mindspore
import
context
from
mindspore.parallel._auto_parallel_context
import
auto_parallel_context
class
Net
(
nn
.
Cell
):
"""Net definition"""
...
...
@@ -85,8 +84,8 @@ def test_lamb_compile():
def
test_lamb_split_fusion
():
""" test_Lamb_split_fusion """
context
.
set_auto_parallel_context
(
parallel_mode
=
"data_parallel"
,
device_num
=
2
,
enable_parallel_optimizer
=
True
)
auto_parallel_context
().
set_all_reduce_fusion_split_indices
(
[
2
,
4
,
6
,
8
])
context
.
set_auto_parallel_context
(
parallel_mode
=
"data_parallel"
,
device_num
=
2
,
enable_parallel_optimizer
=
True
,
all_reduce_fusion_config
=
[
2
,
4
,
6
,
8
])
inputs
=
Tensor
(
np
.
ones
([
32
,
128
]).
astype
(
np
.
float32
))
label
=
Tensor
(
np
.
zeros
([
32
,
768
]).
astype
(
np
.
float32
))
net
=
Net
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录