magicwindyyd / mindspore · forked from MindSpore / mindspore
Commit 48c734b6
Authored Jun 27, 2020 by lirongzhen1

auto parallel for sparse gradient

Parent: 87213648
Showing 1 changed file with 57 additions and 15 deletions (+57 / -15)
mindspore/nn/wrap/grad_reducer.py  (+57 / -15)
@@ -16,18 +16,22 @@
 from mindspore.nn.cell import Cell
 from mindspore.communication.management import GlobalComm, get_group_size
 from mindspore.ops import functional as F, composite as C, operations as P
-from mindspore.ops.operations.comm_ops import AllReduce, ReduceOp
+from mindspore.ops.operations.comm_ops import AllReduce, ReduceOp, AllGather
 import mindspore.common.dtype as mstype

 reduce_opt = C.MultitypeFuncGraph("reduce_opt")

 _all_reduce = AllReduce()
+_all_gather = None


-def _init_optimizer_allreduce():
+def _init_optimizer_communication():
     global _all_reduce
+    global _all_gather
     _all_reduce = AllReduce(ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP)
     _all_reduce.add_prim_attr('fusion', 1)
+    _all_gather = AllGather(GlobalComm.WORLD_COMM_GROUP)


 @reduce_opt.register("Function", "Number", "Bool", "Tensor")
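For orientation: this commit teaches the reducer to handle sparse gradients, which travel through this file as a tuple of (indices, values, shape) — grad[0], grad[1], grad[2] in the hunks below. The renamed _init_optimizer_communication now sets up both collectives: AllReduce sums dense gradients element-wise, while AllGather concatenates each rank's sparse pieces. A minimal plain-Python sketch of that distinction (illustration only, not MindSpore code; the helper names are made up):

# Illustration only: what the two collectives do to a gradient.
def all_reduce_sum(dense_per_rank):
    # Element-wise sum across ranks, like AllReduce(ReduceOp.SUM).
    return [sum(column) for column in zip(*dense_per_rank)]

def all_gather(chunks_per_rank):
    # Concatenation across ranks, like AllGather.
    return [x for chunk in chunks_per_rank for x in chunk]

dense = all_reduce_sum([[1.0, 2.0], [3.0, 4.0]])   # -> [4.0, 6.0]
rows = all_gather([[0, 3], [3, 7]])                # -> [0, 3, 3, 7]
print(dense, rows)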
@@ -72,8 +76,8 @@ def _tensors_allreduce_mean_with_sparse(mul, degree, allreduce_filter, grad):
         degree = F.scalar_cast(degree, F.dtype(grad[1]))
         dout = _all_gather(grad[1])
         cast_op = P.Cast()
-        dout = mul(dout, cast_op(F.scalar_to_array(1.0/degree), F.dtype(dout)))
-        grad = (indices, dout, dout[2])
+        dout = mul(dout, cast_op(F.scalar_to_array(1.0/degree), F.dtype(dout)))
+        grad = (indices, dout, grad[2])
     return grad
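The second changed line above fixes which tuple element ends up as the shape. Assuming the (indices, values, shape) layout suggested by the surrounding code, dout[2] after the allgather indexes into the gathered values, whereas grad[2] is the shape carried on the incoming tuple; the same correction appears in the next hunk. A toy illustration (plain Python, hypothetical data):

# Hypothetical data showing why grad[2] (not dout[2]) is the shape.
grad = ([0, 3], [[0.1, 0.1], [0.2, 0.2], [0.5, 0.5]], (10, 2))  # (indices, values, shape)

indices = grad[0] + [3, 7]                        # stand-in for _all_gather(grad[0])
dout = grad[1] + [[0.3, 0.3], [0.4, 0.4]]         # stand-in for _all_gather(grad[1])
dout = [[v / 2 for v in row] for row in dout]     # stand-in for the 1/degree scaling

print(dout[2])   # a row of gathered values, [0.25, 0.25]
print(grad[2])   # the dense shape, (10, 2)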
@@ -110,7 +114,7 @@ def _tensors_allreduce_with_sparse(allreduce_filter, grad):
     if allreduce_filter:
         indices = _all_gather(grad[0])
         dout = _all_gather(grad[1])
-        grad = (indices, dout, dout[2])
+        grad = (indices, dout, grad[2])
     return grad
@@ -131,6 +135,20 @@ def _tensors_get_datatype(grad):
     return F.dtype(grad)


+@_get_datatype.register("Tuple")
+def _tensors_get_datatype_with_sparse(grad):
+    """
+    Acquire gradient datatype.
+
+    Args:
+        grad (Tuple): The gradient tensor before operation.
+
+    Returns:
+        mstype, the datatype of gradient.
+    """
+    return F.dtype(grad[1])
+
+
 _cast_datatype = C.MultitypeFuncGraph("_cast_datatype")
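The new registration lets the existing _get_datatype multitype graph accept the sparse tuple: the dense overload inspects the tensor itself, while the "Tuple" overload reads the dtype of the values element grad[1]. MultitypeFuncGraph dispatches on the types of its inputs at graph-compile time; very roughly, it behaves like type-based dispatch in plain Python (analogy only, not the real mechanism):

# Rough analogy for the _get_datatype dispatch, using functools.singledispatch.
from functools import singledispatch

@singledispatch
def get_datatype(grad):
    # Default overload: a dense gradient, look at the value itself.
    return type(grad).__name__

@get_datatype.register(tuple)
def _(grad):
    # Tuple overload: a sparse gradient, look at the values element grad[1].
    return type(grad[1]).__name__

print(get_datatype(1.25))                     # 'float' (dense path)
print(get_datatype(([0, 3], [0.1], (10,))))   # 'list' via grad[1] (sparse path)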
@@ -149,6 +167,22 @@ def _tensors_cast_datatype(datatype, grad):
     return F.cast(grad, datatype)


+@_cast_datatype.register("TypeType", "Tuple")
+def _tensors_cast_datatype_with_sparse(datatype, grad):
+    """
+    Cast gradient to datatype.
+
+    Args:
+        datatype (mstype): the destination datatype of gradient.
+        grad (Tuple): The gradient tensor before operation.
+
+    Returns:
+        Tuple, the gradient tuple after operation.
+    """
+    dout = F.cast(grad[1], datatype)
+    return (grad[0], dout, grad[2])
+
+
 class DistributedGradReducer(Cell):
     """
     A distributed optimizer.
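The matching cast overload converts only the values element; the integer indices and the shape element pass through unchanged, which keeps the tuple usable as a sparse gradient after the dtype round trip. A small plain-Python sketch of that behaviour (hypothetical cast helper, not the MindSpore API):

# Sketch of _tensors_cast_datatype_with_sparse: only grad[1] changes type.
def cast_sparse_grad(grad, datatype):
    indices, values, shape = grad
    dout = [datatype(v) for v in values]   # stand-in for F.cast(grad[1], datatype)
    return (indices, dout, shape)          # indices and shape are untouched

grad = ([0, 3], [1, 2], (10,))
print(cast_sparse_grad(grad, float))       # ([0, 3], [1.0, 2.0], (10,))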
@@ -224,7 +258,7 @@ class DistributedGradReducer(Cell):
     def __init__(self, parameters, mean=True, degree=None):
         super(DistributedGradReducer, self).__init__(auto_prefix=False)
-        self.hyper_map = C.HyperMap()
+        self.map_ = C.Map()
         self.mul = P.Mul()
         if degree is None:
             self.degree = get_group_size()
@@ -234,19 +268,27 @@ class DistributedGradReducer(Cell):
             self.degree = degree
         self.mean = mean
         self.allreduce_filter = tuple(x.layerwise_parallel is False for x in parameters)
-        _init_optimizer_allreduce()
+        _init_optimizer_communication()

     def construct(self, grads):
-        # In some circumstances, the data precision of grads could be mixed with float16 and float32. Thus, the
-        # result of AllReduce is unreliable. To solve the problem, grads should be cast to float32 before AllReduce,
-        # and cast back after the operation.
-        datatypes = self.hyper_map(F.partial(_get_datatype), grads)
-        grads = self.hyper_map(F.partial(_cast_datatype, mstype.float32), grads)
+        """
+        In some circumstances, the data precision of grads could be mixed with float16 and float32. Thus, the
+        result of AllReduce is unreliable. To solve the problem, grads should be cast to float32 before AllReduce,
+        and cast back after the operation.
+
+        Args:
+            grads (Union[Tensor, tuple[Tensor]]): The gradient tensor or tuple before operation.
+
+        Returns:
+            new_grads (Union[Tensor, tuple[Tensor]]), the gradient tensor or tuple after operation.
+        """
+        datatypes = self.map_(F.partial(_get_datatype), grads)
+        grads = self.map_(F.partial(_cast_datatype, mstype.float32), grads)
         if self.mean:
-            new_grad = self.hyper_map(F.partial(reduce_opt, self.mul, self.degree), self.allreduce_filter, grads)
+            new_grad = self.map_(F.partial(reduce_opt, self.mul, self.degree), self.allreduce_filter, grads)
         else:
-            new_grad = self.hyper_map(F.partial(reduce_opt), self.allreduce_filter, grads)
+            new_grad = self.map_(F.partial(reduce_opt), self.allreduce_filter, grads)

-        new_grad = self.hyper_map(F.partial(_cast_datatype), datatypes, new_grad)
+        new_grad = self.map_(F.partial(_cast_datatype), datatypes, new_grad)
         return new_grad
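Taken together, construct() records each gradient's dtype, casts everything to float32, applies the per-type reduce (mean allreduce for dense tensors, allgather for the sparse tuples), then casts back. A minimal plain-Python sketch of that cast-reduce-cast-back pattern (illustrative names only, not the MindSpore API):

# Illustrative sketch of the construct() flow. reduce_fn stands in for the
# reduce_opt multitype graph; Python's int/float stand in for mstype dtypes.
def distributed_grad_reduce(grads, reduce_fn, degree):
    datatypes = [type(g) for g in grads]                    # remember original dtypes
    grads = [float(g) for g in grads]                       # cast everything to float32
    new_grad = [reduce_fn(g) / degree for g in grads]       # mean allreduce per gradient
    return [dt(g) for dt, g in zip(datatypes, new_grad)]    # cast back to the originals

# Pretend two ranks contributed; reduce_fn simulates the summed AllReduce result.
summed = lambda g: g * 2
print(distributed_grad_reduce([1, 2.0], summed, degree=2))  # -> [1, 2.0]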