MegEngine 天元 / MegEngine
Commit e50fa074

fix(mge/imperative): remove backward from optimizer

GitOrigin-RevId: ad6ad444faf0e39ba6a5936c5f3a57a3b2406c75

Authored on Sep 04, 2020 by Megvii Engine Team
Parent: 60702667

Showing 2 changed files with 6 additions and 243 deletions (+6, -243):

- imperative/python/megengine/optimizer/distributed_optimizer.py (+0, -120)
- imperative/python/megengine/optimizer/optimizer.py (+6, -123)
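Since the commit message is terse, here is a hedged sketch of what "removing backward from the optimizer" changes in a training loop. The "before" comments mirror the `record()` / `backward()` / `zero_grad()` API that this commit deletes or renames; the "after" code assumes gradients come from a separate recorder such as `megengine.autodiff.GradManager`, which later MegEngine releases provide but which is not part of this diff. The tiny model and random data are throwaway placeholders.

```python
# Hedged sketch of the user-facing change implied by this commit.
# GradManager is assumed from later MegEngine releases; it is not part of this diff.
import numpy as np

import megengine as mge
import megengine.module as M
import megengine.optimizer as optim
from megengine.autodiff import GradManager

model = M.Linear(4, 2)                      # placeholder model
opt = optim.SGD(model.parameters(), lr=0.01)
gm = GradManager().attach(model.parameters())

data = mge.tensor(np.random.randn(8, 4).astype("float32"))
label = mge.tensor(np.random.randn(8, 2).astype("float32"))

# Before this commit, the Optimizer recorded and differentiated the loss itself:
#     with opt.record():
#         loss = ((model(data) - label) ** 2).mean()
#         opt.backward(loss)    # removed by this commit
#     opt.step()
#     opt.zero_grad()           # renamed to clear_grad() by this commit

# After this commit, autodiff lives outside the Optimizer, which only updates parameters:
with gm:
    loss = ((model(data) - label) ** 2).mean()
    gm.backward(loss)
opt.step()
opt.clear_grad()
```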
imperative/python/megengine/optimizer/distributed_optimizer.py (deleted, file mode 100644 → 0; view file @ 60702667)
```python
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
from typing import Iterable as Iter
from typing import Optional, Union

from ..device import get_default_device
from ..distributed.group import get_client, is_distributed
from ..functional import add_update
from ..functional.distributed import WORLD, Group, all_reduce_sum, broadcast
from ..functional.utils import copy
from ..tensor import Tensor, TensorDict
from ..tensor_nn import Parameter
from .optimizer import Optimizer
from .param_pack import get_pack_list, pack_allreduce_split


class DistributedOptimizer(Optimizer):
    r"""Add Distributed Func for distributed training.

    :param params: specifies what Tensors should be optimized.
    :param defaults: a dict of default parameters of Optimizer, like learning rate or momentum.
    :param reduce_method: use all_reduce_sum or all_reduce_mean to reduce gradients
    :param bcast_period: broadcasts params every *bcast_period* iterations.
        if it equals to 0, it will broadcast params only at the beginning. Default: 500
    :param param_pack: whether to pack gradients to avoid small packages send/recv. Default: False
    :param param_pack_thd: max size of packed gradients by bytes. Default: 10 * 1024 * 1024
    """

    def __init__(
        self,
        params: Union[Iter[Parameter], dict],
        defaults: dict,
        reduce_method: Optional[str] = None,
        dist_group: Optional[Group] = WORLD,
        bcast_period: int = 0,
        param_pack: bool = False,
        param_pack_thd: int = 10 * 1024 * 1024,
    ):
        if is_distributed():
            assert reduce_method in ["sum", "mean"], "reduce_method must be specified"
        defaults["orders"] = []
        defaults["dist_group"] = dist_group
        super().__init__(params, defaults)
        self._bcast_period = bcast_period
        self._param_pack = param_pack
        self._param_pack_thd = param_pack_thd
        self._reduce_method = reduce_method

        self.add_save_load_state_ignore_keys(
            {"grads", "orders", "pack_list", "shape_list", "dist_group"}
        )

        if is_distributed() and bcast_period != -1:
            self.bcast_param()

    def grad_callback(self, grad, i, group):
        if is_distributed() and group["dist_group"] is not None:
            dist_group = group["dist_group"]
            if self._param_pack and "pack_list" in group:
                for pack, shapes in zip(group["pack_list"], group["shape_list"]):
                    if i == pack[-1]:
                        pack_allreduce_split(group, pack, shapes, self._reduce_method)
            else:
                group["orders"].append(i)
                group["grads"][i] = all_reduce_sum(
                    grad, dist_group, dist_group.comp_node
                )
                if self._reduce_method == "mean":
                    group["grads"][i] /= dist_group.size

    def _gen_pack_list(self, group):
        if "pack_list" not in group:
            dist_group = group["dist_group"]
            if dist_group.rank == 0:
                pack_list, shape_list = get_pack_list(group, self._param_pack_thd)
                get_client().set_pack_list(dist_group.key, (pack_list, shape_list))
            else:
                pack_list, shape_list = get_client().get_pack_list(dist_group.key)
            group["pack_list"] = pack_list
            group["shape_list"] = shape_list

    def backward(self, loss: Tensor):
        ret = super().backward(loss)
        if is_distributed():
            for group in self.param_groups:
                if self._param_pack and group["dist_group"] is not None:
                    self._gen_pack_list(group)
        return ret

    def step(self):
        if is_distributed():
            for group in self.param_groups:
                device = get_default_device()
                for param in group["params"]:
                    if param.__wrapped__ not in self._grad_skip:
                        if param.grad.device != device:
                            param.grad = copy(param.grad, device)
            if self._bcast_period > 0:
                self._bcast_iter += 1
                if self._bcast_iter == self._bcast_period:
                    self.bcast_param()
                    self._bcast_iter = 0
        super().step()

    def bcast_param(self):
        device = get_default_device()
        for group in self.param_groups:
            for param in group["params"]:
                dist_group = group["dist_group"]
                new_param = broadcast(param, dist_group)
                if new_param.device != device:
                    new_param = copy(new_param, device)
                add_update(param, new_param, alpha=0)
                param._reset(new_param)
```
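The deleted file relies on `get_pack_list` and `pack_allreduce_split` from `.param_pack`, which this diff does not show. As a rough, hedged illustration of the idea behind `param_pack_thd` (greedily group small gradients into packs of at most roughly that many bytes, so that `grad_callback` can all-reduce a whole pack once its last gradient index, `pack[-1]`, arrives), here is a standalone toy packer; it is not MegEngine's actual implementation.

```python
# Toy illustration of packing gradients by a byte threshold; this is NOT
# megengine's param_pack.get_pack_list, whose real implementation is not shown here.
from typing import List, Sequence


def toy_pack_indices(
    nbytes_per_grad: Sequence[int], pack_thd: int = 10 * 1024 * 1024
) -> List[List[int]]:
    """Greedily group gradient indices so each pack holds at most roughly
    ``pack_thd`` bytes; a grad callback could then all-reduce a pack when
    its last index arrives."""
    packs: List[List[int]] = []
    current: List[int] = []
    current_bytes = 0
    for i, nbytes in enumerate(nbytes_per_grad):
        if current and current_bytes + nbytes > pack_thd:
            packs.append(current)
            current, current_bytes = [], 0
        current.append(i)
        current_bytes += nbytes
    if current:
        packs.append(current)
    return packs


MB = 1024 * 1024
# Four 4 MB gradients with a 10 MB threshold -> two packs of two gradients each.
print(toy_pack_indices([4 * MB] * 4, pack_thd=10 * MB))  # [[0, 1], [2, 3]]
```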
imperative/python/megengine/optimizer/optimizer.py (modified; view file @ e50fa074)
```diff
@@ -11,22 +11,13 @@ from collections import Iterable
 from contextlib import contextmanager
 from typing import Dict
 from typing import Iterable as Iter
-from typing import Set, Union
+from typing import Union
 
 import numpy as np
 
-from ..core.autodiff.grad import Grad
-from ..device import get_default_device
-from ..distributed.group import get_client, is_distributed
-from ..functional import add_update
-from ..functional.distributed import all_reduce_sum, broadcast
-from ..functional.utils import copy
-from ..logger import get_logger
 from ..tensor import Tensor, TensorDict
 from ..tensor_nn import Buffer, Parameter
 
-logger = get_logger(__name__)
-
 
 class _RequiredParameter:
     def __repr__(self):
@@ -43,10 +34,6 @@ class Optimizer(metaclass=ABCMeta):
     :param defaults: a dict of default parameters of Optimizer, like learning rate or momentum.
     """
 
-    _recording = None
-    _grad = None
-    _gradients = None
-
     def __init__(  # pylint: disable=too-many-branches
         self, params: Union[Iter[Parameter], dict], defaults: dict,
     ):
@@ -63,7 +50,6 @@ class Optimizer(metaclass=ABCMeta):
             )
 
         self.param_groups = []  # type: list
-        self.save_load_state_ignore_keys = set()
 
         param_groups = list(params)
         if len(param_groups) == 0:
@@ -154,100 +140,6 @@ class Optimizer(metaclass=ABCMeta):
                 params.append(param)
         return params
 
-    def grad_callback(self, grad, i, group):
-        pass
-
-    def record(self):
-        @contextmanager
-        def recorder():
-            params = self._get_params()
-            grad = Grad()
-            gradients = [None] * len(params)
-            if self._recording:
-                raise RuntimeError("already recording!")
-            try:
-                self._recording = True
-                self._grad = grad
-                for group in self.param_groups:
-                    group["grads"] = [None] * len(group["params"])
-                    for i, param in enumerate(group["params"]):
-
-                        def callback(tensor, grad, i=i, group=group, self=self):
-                            group["grads"][i] = grad
-                            self.grad_callback(grad, i, group)
-
-                        grad.wrt(param, callback=callback)
-                with grad:
-                    yield
-            finally:
-                self._recording = False
-                self._grad = None
-                for group in self.param_groups:
-                    group["grads"] = []
-
-        return recorder()
-
-    def _calculate_gradients(self, loss: Tensor):
-        if not self._recording:
-            raise RuntimeError(
-                "no computation history. "
-                "did you forget record() or "
-                "call a method that clears the history?"
-            )
-        assert self._grad is not None
-        if len(loss.__wrapped__._extra_data) == 0:  # in case loss depends on no tensor
-            self._grad = None
-            return
-
-        one = Tensor([1.0], dtype=loss.dtype, device=loss.device)
-        one = one.reshape(loss.shape)
-        try:
-            self._grad(loss, one)
-        finally:
-            self._grad = None
-
-    def minimize(self, loss: Tensor):
-        self.backward(loss)
-        self.step()
-
-    def backward(self, loss: Tensor):
-        """Computes the back-propagation of the network given loss.
-
-        :param loss: The obtained loss tensor
-        """
-        rst = []
-        self._calculate_gradients(loss)
-
-        # _grad_skip records the parameters which are not in the path of backward
-        self._grad_skip = set()
-        for group in self.param_groups:
-            # _grad_skip is consumed in optimizer.step()
-            # XXX: assumptions
-            # 1. Assume the same execution sequence for all GPUs in data parallel
-            # 2. If backward is called by multiple times to accumulate grad,
-            #    it's also assumed same _grad_skip for all backward() calls
-            # Please change the code if any assumption is invalid
-            for param, grad in zip(group["params"], group["grads"]):
-                if grad is None:
-                    self._grad_skip.add(param.__wrapped__)
-                    continue
-                grad = Buffer(grad)
-                if getattr(param, "grad", None) is None:
-                    param.grad = grad
-                else:
-                    assert isinstance(param.grad, Buffer)
-                    param.grad += grad
-                rst.append(param.grad)
-        if len(self._grad_skip) > 0:
-            get_logger(__name__).warning(
-                "{} parameters have no grad! "
-                "Make sure you pass the right parameters list".format(
-                    len(self._grad_skip)
-                )
-            )
-        return rst
-
     def step(self):
         r"""Performs a single optimization step.
@@ -261,8 +153,8 @@ class Optimizer(metaclass=ABCMeta):
             )
             self._updates(group)
 
-    def zero_grad(self):
-        r"""Reset the grad to zeros.
+    def clear_grad(self):
+        r"""Clear the grad buffer.
         """
         for param_group in self.param_groups:
@@ -270,9 +162,6 @@ class Optimizer(metaclass=ABCMeta):
                 if getattr(param, "grad", None) is not None:
                     param.grad = None
 
-    def add_save_load_state_ignore_keys(self, keys: Set[str]):
-        self.save_load_state_ignore_keys |= keys
-
     def state_dict(self) -> Dict:
         r"""Export the optimizer state.
@@ -293,11 +182,7 @@ class Optimizer(metaclass=ABCMeta):
                 state[param2id[param]] = st
 
         for group in self.param_groups:
-            param_group = {
-                k: v
-                for k, v in group.items()
-                if k != "params" and k not in self.save_load_state_ignore_keys
-            }
+            param_group = {k: v for k, v in group.items() if k != "params"}
             param_group["params"] = [param2id[param] for param in group["params"]]
             param_groups.append(param_group)
@@ -329,14 +214,12 @@ class Optimizer(metaclass=ABCMeta):
                 if isinstance(v, Buffer):
                     self._state[p][k] = Buffer(v.numpy())
 
-            new_keys = set(group_new.keys()) - self.save_load_state_ignore_keys
-            saved_keys = set(group_saved.keys()) - self.save_load_state_ignore_keys
-            if new_keys != saved_keys:
+            if set(group_new.keys()) != set(group_saved.keys()):
                 raise ValueError(
                     "loaded state dict contains a parameter group that "
                     "doesn't match the keys of optimizer's group"
                 )
-            for key in saved_keys:
+            for key in group_new.keys():
                 if key != "params":
                     group_new[key] = group_saved[key]
```
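The removed `record()` / `_calculate_gradients()` pair documents what callers now have to arrange themselves: register a per-parameter callback via `Grad.wrt()`, run the forward pass inside the `Grad` context, and seed backpropagation with a ones tensor shaped like the loss. The sketch below re-creates that pattern outside the optimizer using the same low-level `Grad` class the old code imported; it mirrors the deleted logic as of this commit, uses internal API that may have changed since, and glosses over details such as `__wrapped__`, so treat it as illustrative only.

```python
# Hedged sketch: the gradient-recording pattern removed from Optimizer,
# reproduced standalone with the low-level Grad class the old code imported.
# Internal API as of this commit; illustrative, not recommended usage.
import numpy as np

import megengine as mge
import megengine.module as M
from megengine.core.autodiff.grad import Grad

model = M.Linear(4, 2)                      # placeholder model
params = list(model.parameters())
grads = [None] * len(params)                # mirrors group["grads"] in the removed code

grad = Grad()
for i, p in enumerate(params):
    # same default-argument closure trick as the removed record() method
    def callback(tensor, g, i=i):
        grads[i] = g

    grad.wrt(p, callback=callback)

with grad:
    data = mge.tensor(np.random.randn(8, 4).astype("float32"))
    loss = (model(data) ** 2).mean()
    # seed backprop with ones shaped like the loss, as _calculate_gradients() did
    one = mge.tensor(np.ones((1,), dtype="float32")).reshape(loss.shape)
    grad(loss, one)

print([g is not None for g in grads])       # gradients collected by the callbacks
```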