PaddlePaddle / Paddle · commit a5dc0a79 (unverified)

[Eager] Rename EagerPyLayer to PyLayer (#43696)

* rename eagerpylayer

Authored by wanghuancoder on Jun 27, 2022; committed via GitHub on Jun 27, 2022.
Parent: 8a122ecc
Showing 20 changed files with 1,802 additions and 470 deletions (+1802 −470).
paddle/fluid/pybind/eager_py_layer.cc  +11 −8
python/paddle/autograd/__init__.py  +7 −1
python/paddle/autograd/py_layer.py  +7 −3
python/paddle/distributed/collective.py  +2 −2
python/paddle/distributed/fleet/base/fleet_base.py  +3 −2
python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py  +7 −139
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py  +3 −3
python/paddle/distributed/fleet/utils/recompute.py  +22 −17
python/paddle/fluid/tests/unittests/CMakeLists.txt  +12 −0
python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_with_pylayer.py  +2 −20
python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py  +3 −2
python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api_for_eager.py  +35 −0
python/paddle/fluid/tests/unittests/test_dygraph_recompute.py  +0 −25
python/paddle/fluid/tests/unittests/test_dygraph_recompute_for_eager.py  +215 −0
python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py  +3 −3
python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3_for_eager.py  +40 −0
python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py  +4 −48
python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py  +1374 −0
python/paddle/fluid/tests/unittests/test_pylayer_op.py  +44 −33
python/paddle/incubate/distributed/models/moe/moe_layer.py  +8 −164
paddle/fluid/pybind/eager_py_layer.cc
@@ -129,16 +129,19 @@ PyObject* pylayer_method_apply(PyObject* cls,
   bool require_any_grad = false;
 
   size_t inputs_size = 0;
+  size_t args_size = 0;
+  size_t kwargs_size = 0;
   PyObject* forward_args = nullptr;
   PyObject* kwargs_value_list = nullptr;
   if (kwargs) {
-    inputs_size = PyDict_Size(kwargs);
+    kwargs_size = PyDict_Size(kwargs);
     kwargs_value_list = PyDict_Values(kwargs);
-    forward_args = PyTuple_New(1);
-  } else {
-    inputs_size = PyTuple_GET_SIZE(args);
-    forward_args = PyTuple_New(inputs_size + 1);
   }
+  if (args) {
+    args_size = PyTuple_GET_SIZE(args);
+  }
+  inputs_size = kwargs_size + args_size;
+  forward_args = PyTuple_New(args_size + 1);
 
   Py_INCREF(ctx);
   PyTuple_SET_ITEM(forward_args, 0, reinterpret_cast<PyObject*>(ctx));
@@ -150,8 +153,8 @@ PyObject* pylayer_method_apply(PyObject* cls,
   ctx->forward_input_tensor_is_duplicable.reserve(inputs_size);
   for (size_t i = 0; i < inputs_size; i++) {
     PyObject* obj = nullptr;
-    if (kwargs) {
-      obj = PyList_GetItem(kwargs_value_list, i);
+    if (i >= args_size) {
+      obj = PyList_GetItem(kwargs_value_list, i - args_size);
     } else {
       obj = PyTuple_GET_ITEM(args, i);
     }
@@ -212,7 +215,7 @@ PyObject* pylayer_method_apply(PyObject* cls,
       }
     }
-    if (!kwargs) {
+    if (i < args_size) {
       Py_INCREF(obj);
       PyTuple_SET_ITEM(forward_args, i + 1, obj);
     }
python/paddle/autograd/__init__.py
@@ -17,7 +17,13 @@ from ..fluid.dygraph.base import no_grad_ as no_grad  # noqa: F401
 from ..framework import is_grad_enabled, set_grad_enabled  # noqa: F401
 from . import backward_mode  # noqa: F401
 from .backward_mode import backward  # noqa: F401
-from .py_layer import PyLayer, PyLayerContext, EagerPyLayer, EagerPyLayerContext  # noqa: F401
+from ..fluid.framework import _in_eager_mode_
+if _in_eager_mode_:
+    from .py_layer import EagerPyLayer as PyLayer  # noqa: F401
+    from .py_layer import EagerPyLayerContext as PyLayerContext  # noqa: F401
+else:
+    from .py_layer import LegacyPyLayer as PyLayer  # noqa: F401
+    from .py_layer import LegacyPyLayerContext as PyLayerContext  # noqa: F401
 from ..framework import set_grad_enabled, is_grad_enabled  # noqa: F401
 from ..fluid.dygraph.base import no_grad_ as no_grad  # noqa: F401
 from .functional import vjp, jvp, Jacobian, Hessian  # noqa: F401
python/paddle/autograd/py_layer.py
@@ -21,7 +21,7 @@ from paddle.fluid import core
 __all__ = []
 
 
-class PyLayerContext(object):
+class LegacyPyLayerContext(object):
     """
     The object of this class is a context that is used in PyLayer to enhance the function.
@@ -181,7 +181,7 @@ class CPyLayer(object):
             return core.pylayer_apply(place, cls, *args, **kwargs)
 
 
-class PyLayerBackward(PyLayerContext):
+class PyLayerBackward(LegacyPyLayerContext):
 
     def backward(self, *args, **kwargs):
         with paddle.fluid.dygraph.guard():
@@ -205,7 +205,7 @@ class LayerMeta(type):
         return super(LayerMeta, cls).__init__(name, bases, attrs)
 
 
-class PyLayer(with_mateclass(LayerMeta, CPyLayer)):
+class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
     """
     Build a custom `Layer` by creating subclasses. Subclasses need to follow the following rules:
     1. Subclasses contain `forward` and `backward` function. Both forward and backward are @staticmethod.
@@ -425,6 +425,8 @@ class EagerPyLayerContext(object):
         Examples:
             .. code-block:: python
 
+                import os
+                os.environ['FLAGS_enable_eager_mode'] = '1'
                 import paddle
                 from paddle.autograd import PyLayer
                 import numpy as np
@@ -464,6 +466,8 @@ class EagerPyLayerContext(object):
         Examples:
             .. code-block:: python
 
+                import os
+                os.environ['FLAGS_enable_eager_mode'] = '1'
                 import paddle
                 from paddle.autograd import PyLayer
                 import numpy as np
python/paddle/distributed/collective.py
@@ -1181,9 +1181,9 @@ def _mp_allreduce(tensor,
     if in_dygraph_mode():
         assert op == ReduceOp.SUM, "Unknown parameter: {}.".format(op)
 
-        from paddle.autograd import EagerPyLayer
+        from paddle.autograd import PyLayer
 
-        class mp_allreduce_eager(EagerPyLayer):
+        class mp_allreduce_eager(PyLayer):
 
             @staticmethod
             def forward(ctx, tensor, use_calc_stream, ring_id,
python/paddle/distributed/fleet/base/fleet_base.py
@@ -37,7 +37,7 @@ from ..meta_optimizers import HybridParallelOptimizer, HeterParallelOptimizer
 from paddle import _C_ops
 from paddle.fluid import core
 from paddle.fluid.dygraph import to_variable
-from paddle.distributed.fleet.utils.recompute import RecomputeFunction
+from paddle.distributed.fleet.utils.recompute import LegacyRecomputeFunction
 from paddle.fluid.dygraph.varbase_patch_methods import _grad_scalar
 
 __all__ = []
@@ -68,7 +68,8 @@ class _RecomputeModelWrapper(paddle.nn.Layer):
         return do_run
 
     def _checkpoint(self, func, *args, **kwargs):
-        return RecomputeFunction.apply(func, self._preserve_rng_state, *args)
+        return LegacyRecomputeFunction.apply(func, self._preserve_rng_state,
+                                             *args)
 
     def forward(self, input):
         end = 0
python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py
@@ -17,7 +17,7 @@ import contextlib
 import paddle
 from paddle.fluid import core
 from paddle import _C_ops
-from paddle.autograd import PyLayer, EagerPyLayer
+from paddle.autograd import PyLayer
 from paddle.fluid import framework
 from ...utils.recompute import check_recompute_necessary, detach_variable, swith_rng_state_tracker
 from ..parallel_layers.random import get_rng_state_tracker
@@ -151,7 +151,7 @@ def _merge_activation(tensor):
     return _all_gather(tensor, group=mp_group)
 
 
-class _HPEagerRecomputeFunction(EagerPyLayer):
+class _HPRecomputeFunction(PyLayer):
     """
     Compared with paddle.distributed.fleet.utils.recompute, there are the following differences:
     1. In order to support PipeLineParallel, the input of recompute is modified to ensure that the input can be tuple type.
@@ -256,7 +256,7 @@ class _HPEagerRecomputeFunction(EagerPyLayer):
             detached_inputs = detach_variable(tuple(inputs))
             outputs = ctx.run_function(*detached_inputs)
 
-            if isinstance(outputs, core.eager.Tensor):
+            if isinstance(outputs, (core.VarBase, core.eager.Tensor)):
                 outputs = (outputs, )
             assert len(outputs) == len(args)
@@ -266,137 +266,8 @@ class _HPEagerRecomputeFunction(EagerPyLayer):
             for i in range(len(outputs)):
-                if isinstance(
-                        outputs[i],
-                        core.eager.Tensor) and not outputs[i].stop_gradient:
-                    forward_outputs_with_grad.append(outputs[i])
-                    backward_inputs.append(args[i])
-
-            if len(forward_outputs_with_grad) == 0:
-                raise RuntimeError(
-                    "none of output has stop_gradient=False, this recompute() is not necessary"
-                )
-
-            # actually backward
-            paddle.autograd.backward(forward_outputs_with_grad, backward_inputs)
-            grads = tuple(inp._grad_ivar() for inp in detached_inputs
-                          if isinstance(inp, core.eager.Tensor))
-            return grads
-
-
-class _HPRecomputeFunction(PyLayer):
-    """
-    Compared with paddle.distributed.fleet.utils.recompute, there are the following differences:
-    1. In order to support PipeLineParallel, the input of recompute is modified to ensure that the input can be tuple type.
-    2. Offload support for activation
-    3. Support MP segmentation of activation to further reduce cuda memory
-    4. Adapt to the random state of MP
-    """
-
-    @staticmethod
-    def forward(ctx, run_function, all_outputs, *args):
-        check_recompute_necessary(args)
-
-        # store for recomputing
-        ctx.run_function = run_function
-
-        # store the rng states
-        ctx.fwd_cuda_rng_state = paddle.get_cuda_rng_state()
-        ctx.fwd_cuda_rng_state_tracker = get_rng_state_tracker(
-        ).get_states_tracker()
-
-        # save input for backward
-        ctx.inputs = []
-        ctx.tensor_indices = []
-        ctx.tensor_shapes = []
-        tensor_inputs = []
-
-        cur_device = paddle.get_device()
-        assert 'gpu:' in paddle.get_device(
-        ), "Recompute with RNG is not support current device: {}.".format(
-            cur_device)
-
-        # TODO support AMP
-        tracer = framework._dygraph_tracer()
-        ctx.is_fw_autocast = False if tracer._amp_level == core.AmpLevel.O0 else True
-        if tracer._amp_level == core.AmpLevel.O2:
-            ctx.amp_level = 'O2'
-        elif tracer._amp_level in (core.AmpLevel.O1, core.AmpLevel.O0):
-            ctx.amp_level = 'O1'
-        else:
-            raise ValueError("unsupported amp level: {}".format(
-                tracer._amp_level))
-        ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list()
-
-        with paddle.no_grad():
-            outputs = run_function(*args)
-
-        for i, arg in enumerate(args):
-            if paddle.is_tensor(arg):
-                state = arg.stop_gradient
-                if _recompute_partition:
-                    ctx.tensor_shapes.append(arg.shape)
-                    partition = _split_activation(arg.detach()).clone()
-                    # TODO(shenliang03) not use calculate stream to D2H to speed
-                    arg = partition.cpu() if _recompute_offload else partition
-                else:
-                    arg = arg.cpu() if _recompute_offload else arg
-                arg.stop_gradient = state
-                tensor_inputs.append(arg)
-                ctx.tensor_indices.append(i)
-                ctx.inputs.append(None)
-            else:
-                ctx.inputs.append(arg)
-
-        ctx.save_for_backward(*tensor_inputs)
-
-        if paddle.is_tensor(outputs):
-            all_outputs += [outputs]
-            return outputs
-        else:
-            all_outputs += outputs
-            return tuple(outputs)
-
-    @staticmethod
-    def backward(ctx, *args):
-        with paddle.fluid.dygraph.guard():
-            # Restore inputs
-            inputs = list(ctx.inputs)
-            tensor_indices = ctx.tensor_indices
-            tensor_shapes = ctx.tensor_shapes
-            tensors = list(ctx.saved_tensor())
-
-            device_id = paddle.distributed.ParallelEnv().device_id
-            for i, idx in enumerate(tensor_indices):
-                if _recompute_partition:
-                    state = tensors[i].stop_gradient
-                    tensors[i] = _merge_activation(
-                        tensors[i]).detach().reshape_(tensor_shapes[i])
-                    tensors[i].stop_gradient = state
-                inputs[idx] = tensors[i].cuda(
-                    device_id) if _recompute_offload else tensors[i]
-
-            tracer = framework._dygraph_tracer()
-            tracer._has_grad = True
-
-            # need restore auto_cast state as well as w/b list
-            with swith_rng_state_tracker(ctx.fwd_cuda_rng_state,
-                                         ctx.fwd_cuda_rng_state_tracker):
-                with paddle.amp.auto_cast(enable=ctx.is_fw_autocast,
-                                          custom_white_list=ctx.amp_white_list,
-                                          custom_black_list=ctx.amp_black_list,
-                                          level=ctx.amp_level):
-                    detached_inputs = detach_variable(tuple(inputs))
-                    outputs = ctx.run_function(*detached_inputs)
-
-            if isinstance(outputs, core.VarBase):
-                outputs = (outputs, )
-            assert len(outputs) == len(args)
-
-            forward_outputs_with_grad = []
-            backward_inputs = []
-            for i in range(len(outputs)):
-                if isinstance(outputs[i],
-                              core.VarBase) and not outputs[i].stop_gradient:
+                if isinstance(
+                        outputs[i],
+                    (core.VarBase,
+                     core.eager.Tensor)) and not outputs[i].stop_gradient:
                     forward_outputs_with_grad.append(outputs[i])
                     backward_inputs.append(args[i])
@@ -408,7 +279,7 @@ class _HPRecomputeFunction(PyLayer):
             # actually backward
             paddle.autograd.backward(forward_outputs_with_grad, backward_inputs)
             grads = tuple(inp._grad_ivar() for inp in detached_inputs
-                          if isinstance(inp, core.VarBase))
+                          if isinstance(inp, (core.VarBase, core.eager.Tensor)))
             return grads
@@ -420,10 +291,7 @@ def _hp_recompute(function, *args):
     # 3. Here, we only use float dtype to distinguish whether a gradient is needed in output tensor
 
     all_outputs = []
-    if in_dygraph_mode():
-        _HPEagerRecomputeFunction.apply(function, all_outputs, *args)
-    else:
-        _HPRecomputeFunction.apply(function, all_outputs, *args)
+    _HPRecomputeFunction.apply(function, all_outputs, *args)
 
     if len(all_outputs) == 1:
         return all_outputs[0]
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py
@@ -20,7 +20,7 @@ from collections import OrderedDict
 import paddle
 from paddle import nn
-from paddle.autograd import EagerPyLayer
+from paddle.autograd import PyLayer
 import paddle.fluid.core as core
 import paddle.fluid.framework as framework
 from paddle.fluid.framework import EagerParamBase
@@ -398,7 +398,7 @@ class GroupShardedStage3(nn.Layer):
     def _register_forward_hooks(self, layer):
         """
-        Register EagerPyLayer to manage memory slices.
+        Register PyLayer to manage memory slices.
         There are four stages:
         FW
         1. Before the forward layers, synchronize the full parameters.
@@ -653,7 +653,7 @@ def ForwardPreHooks(layer, order_tracer, trainable_params, param2buffer_size,
     return
 
 
-class ForwardPostHooks(EagerPyLayer):
+class ForwardPostHooks(PyLayer):
 
     @staticmethod
     def forward(ctx, inputs, layer, order_tracer, trainable_params,
python/paddle/distributed/fleet/utils/recompute.py
@@ -14,7 +14,8 @@
 import paddle
 from paddle.fluid import core
-from paddle.autograd import PyLayer, EagerPyLayer
+from paddle.autograd import PyLayer
+from paddle.autograd.py_layer import LegacyPyLayer
 from paddle.fluid import framework
 import contextlib
@@ -68,7 +69,7 @@ def swith_rng_state_tracker(rng_state, tracker):
         get_rng_state_tracker().set_states_tracker(orig_cuda_rng_tracker)
 
 
-class EagerRecomputeFunction(EagerPyLayer):
+class LegacyRecomputeFunction(LegacyPyLayer):
 
     @staticmethod
     def forward(ctx, run_function, preserve_rng_state, *args):
@@ -171,7 +172,7 @@ class EagerRecomputeFunction(EagerPyLayer):
             detached_inputs = detach_variable(tuple(inputs))
             outputs = ctx.run_function(*detached_inputs)
 
-            if isinstance(outputs, core.eager.Tensor):
+            if isinstance(outputs, core.VarBase):
                 outputs = (outputs, )
             assert len(outputs) == len(args)
@@ -183,9 +184,8 @@ class EagerRecomputeFunction(EagerPyLayer):
             # the following backward_inputs_with_grad is used to avoid this case.
             backward_inputs_with_grad = []
             for i in range(len(outputs)):
-                if isinstance(
-                        outputs[i],
-                        core.eager.Tensor) and not outputs[i].stop_gradient:
+                if isinstance(outputs[i],
+                              core.VarBase) and not outputs[i].stop_gradient:
                     forward_outputs_with_grad.append(outputs[i])
                     backward_inputs_with_grad.append(args[i])
@@ -199,8 +199,8 @@ class EagerRecomputeFunction(EagerPyLayer):
             paddle.autograd.backward(forward_outputs_with_grad,
                                      backward_inputs_with_grad)
-            grads = tuple(inp.grad for inp in detached_inputs
-                          if isinstance(inp, core.eager.Tensor))
+            grads = list(inp._grad_ivar() for inp in detached_inputs
+                         if isinstance(inp, core.VarBase))
             return grads
@@ -307,7 +307,7 @@ class RecomputeFunction(PyLayer):
             detached_inputs = detach_variable(tuple(inputs))
             outputs = ctx.run_function(*detached_inputs)
 
-            if isinstance(outputs, core.VarBase):
+            if isinstance(outputs, (core.VarBase, core.eager.Tensor)):
                 outputs = (outputs, )
             assert len(outputs) == len(args)
@@ -319,8 +319,10 @@ class RecomputeFunction(PyLayer):
             # the following backward_inputs_with_grad is used to avoid this case.
             backward_inputs_with_grad = []
             for i in range(len(outputs)):
-                if isinstance(outputs[i],
-                              core.VarBase) and not outputs[i].stop_gradient:
+                if isinstance(
+                        outputs[i],
+                    (core.VarBase,
+                     core.eager.Tensor)) and not outputs[i].stop_gradient:
                     forward_outputs_with_grad.append(outputs[i])
                     backward_inputs_with_grad.append(args[i])
@@ -334,8 +336,14 @@ class RecomputeFunction(PyLayer):
             paddle.autograd.backward(forward_outputs_with_grad,
                                      backward_inputs_with_grad)
-            grads = list(inp._grad_ivar() for inp in detached_inputs
-                         if isinstance(inp, core.VarBase))
+            if in_dygraph_mode():
+                grads = tuple(
+                    inp._grad_ivar() for inp in detached_inputs
+                    if isinstance(inp, (core.VarBase, core.eager.Tensor)))
+            else:
+                grads = list(
+                    inp._grad_ivar() for inp in detached_inputs
+                    if isinstance(inp, (core.VarBase, core.eager.Tensor)))
             return grads
@@ -465,7 +473,4 @@ def recompute(function, *args, **kwargs):
     if framework._dygraph_tracer()._has_grad:
         check_recompute_necessary(args)
 
-    if in_dygraph_mode():
-        return EagerRecomputeFunction.apply(function, preserve, *args)
-    else:
-        return RecomputeFunction.apply(function, preserve, *args)
+    return RecomputeFunction.apply(function, preserve, *args)
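
A small usage sketch (assumed, not taken verbatim from this commit) of the public recompute entry point, which after this change always dispatches to RecomputeFunction; with preserve_rng_state left at its default, a GPU place is required, which is why the CPU case in the tests below expects a RuntimeError:

    import paddle
    from paddle.distributed.fleet.utils import recompute

    # recompute() re-runs `block` during backward instead of storing its
    # activations; at least one input must have stop_gradient=False.
    paddle.set_device("gpu")  # RNG-state preservation needs a GPU place
    block = paddle.nn.Sequential(paddle.nn.Linear(10, 10), paddle.nn.ReLU())
    x = paddle.randn([4, 10])
    x.stop_gradient = False
    y = recompute(block, x)
    y.mean().backward()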
python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -60,7 +60,9 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_sharding_parallel)
 list(APPEND DIST_TEST_OPS test_dygraph_sharding_optimizer_stage2)
 list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage2)
 list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage3)
+list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage3_for_eager)
 list(APPEND DIST_TEST_OPS test_dygraph_group_sharded_api)
+list(APPEND DIST_TEST_OPS test_dygraph_group_sharded_api_for_eager)
 list(APPEND DIST_TEST_OPS test_auto_parallel_parallelizer)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers)
 list(APPEND DIST_TEST_OPS test_hybrid_parallel_inference_helper)
@@ -305,13 +307,17 @@ if((NOT WITH_GPU) AND (NOT WITH_ROCM))
   list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_optimizer_stage2)
   list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage2)
   list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage3)
+  list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage3_for_eager)
   list(REMOVE_ITEM TEST_OPS test_dygraph_group_sharded_api)
+  list(REMOVE_ITEM TEST_OPS test_dygraph_group_sharded_api_for_eager)
   list(REMOVE_ITEM TEST_OPS test_auto_parallel_parallelizer)
   list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers)
   list(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision)
+  list(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision_for_eager)
   list(REMOVE_ITEM TEST_OPS test_mixed_precision)
   list(REMOVE_ITEM TEST_OPS test_fleet_base_single)
   list(REMOVE_ITEM TEST_OPS test_dygraph_recompute)
+  list(REMOVE_ITEM TEST_OPS test_dygraph_recompute_for_eager)
   list(REMOVE_ITEM TEST_OPS test_hybrid_parallel_inference_helper)
   list(REMOVE_ITEM TEST_OPS test_parallel_class_center_sample)
   list(REMOVE_ITEM TEST_OPS test_parallel_margin_cross_entropy)
@@ -1547,7 +1553,11 @@ if(WITH_DISTRIBUTE
                                                                  120)
   set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 200)
   set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 350)
+  set_tests_properties(test_dygraph_sharding_stage3_for_eager PROPERTIES TIMEOUT
+                                                              350)
   set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_dygraph_group_sharded_api_for_eager
+                       PROPERTIES TIMEOUT 120)
   set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120)
   set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120)
   set_tests_properties(test_hybrid_parallel_inference_helper PROPERTIES TIMEOUT
@@ -1637,6 +1647,8 @@ endif()
 if(WITH_GPU OR WITH_ROCM)
   set_tests_properties(test_imperative_auto_mixed_precision PROPERTIES TIMEOUT
                        300)
+  set_tests_properties(test_imperative_auto_mixed_precision_for_eager
+                       PROPERTIES TIMEOUT 300)
   set_tests_properties(test_parallel_dygraph_sync_batch_norm PROPERTIES TIMEOUT
                        120)
   set_tests_properties(test_rank_attention_op PROPERTIES TIMEOUT 120)
python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_with_pylayer.py
@@ -21,7 +21,7 @@ import paddle
 import numpy as np
 import paddle.distributed as dist
 from paddle.fluid.dygraph.nn import Linear
-from paddle.autograd import PyLayer, EagerPyLayer
+from paddle.autograd import PyLayer
 from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph
 from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients
@@ -45,21 +45,6 @@ class cus_tanh(PyLayer):
         return grad
 
 
-class cus_tanh_eager(EagerPyLayer):
-
-    @staticmethod
-    def forward(ctx, x):
-        y = paddle.tanh(x)
-        ctx.save_for_backward(y)
-        return y
-
-    @staticmethod
-    def backward(ctx, dy):
-        y, = ctx.saved_tensor()
-        grad = dy * (1 - paddle.square(y))
-        return grad
-
-
 class SimpleNet(paddle.nn.Layer):
 
     def __init__(self, train_id, model_id):
@@ -73,10 +58,7 @@ class SimpleNet(paddle.nn.Layer):
     def forward(self, inputs):
         if self.model_id == 0:
-            if in_dygraph_mode():
-                inputs = cus_tanh_eager.apply(inputs)
-            elif _in_legacy_dygraph():
-                inputs = cus_tanh.apply(inputs)
+            inputs = cus_tanh.apply(inputs)
         else:
             inputs = self.tanh(inputs)
python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py
@@ -15,6 +15,9 @@
 from __future__ import print_function
 
 import os
+
+os.environ['FLAGS_enable_eager_mode'] = '0'
+
 import unittest
 import paddle.fluid as fluid
@@ -26,9 +29,7 @@ class TestDygraphGroupSharded(TestMultipleGpus):
     # check group sharded logic as well as the accuracy with single mode
     def test_dygraph_group_sharded(self):
         self.run_mnist_2gpu('dygraph_group_sharded_api.py', eager_mode=False)
-        self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py')
 
 
 if __name__ == "__main__":
-    os.environ["FLAGS_enable_eager_mode"] = "1"
     unittest.main()
python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api_for_eager.py (new file, 0 → 100644)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import os

os.environ['FLAGS_enable_eager_mode'] = '1'

import unittest
import paddle.fluid as fluid

from test_parallel_dygraph_dataparallel import TestMultipleGpus


class TestDygraphGroupSharded(TestMultipleGpus):

    # check group sharded logic as well as the accuracy with single mode
    def test_dygraph_group_sharded(self):
        self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py')


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_dygraph_recompute.py
@@ -23,7 +23,6 @@ from paddle.distributed.fleet.utils import recompute
 import random
 
 import paddle.fluid.layers as layers
-from paddle.fluid.framework import _test_eager_guard
 
 
 def get_fc_block(block_idx, input_size, is_last=False):
@@ -181,34 +180,15 @@ class TestPyLayer(unittest.TestCase):
         check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)
 
     def test_fc_net_with_dropout(self):
-        with _test_eager_guard():
-            self.test_base_case()
         self.test_base_case()
 
-    def test_fc_net_without_restore_rng(self):
-        with _test_eager_guard():
-            loss_ref, param_ref, grad_ref = run_model(
-                recompute_block=[2],
-                recompute_kwargs={"preserve_rng_state": False},
-                enable_autocast=True)
-
     def test_fc_net_with_amp(self):
-        with _test_eager_guard():
-            self.test_base_case(enable_autocast=True)
         self.test_base_case(enable_autocast=True)
 
     def test_fc_net_with_fp16(self):
-        with _test_eager_guard():
-            self.test_base_case(enable_autocast=True, pure_fp16=True)
         self.test_base_case(enable_autocast=True, pure_fp16=True)
 
     def test_recompute_kwargs(self):
-        with _test_eager_guard():
-            paddle.set_device("gpu")
-            kwargs = {"is_test": False}
-            with self.assertRaises(ValueError):
-                loss_ref, param_ref, grad_ref = run_model(
-                    recompute_block=[2], recompute_kwargs=kwargs)
         paddle.set_device("gpu")
         kwargs = {"is_test": False}
         with self.assertRaises(ValueError):
@@ -216,11 +196,6 @@ class TestPyLayer(unittest.TestCase):
                 recompute_block=[2], recompute_kwargs=kwargs)
 
     def test_recompute_cpu_rng(self):
-        with _test_eager_guard():
-            paddle.set_device("cpu")
-            with self.assertRaises(RuntimeError):
-                loss_ref, param_ref, grad_ref = run_model(recompute_block=[2])
-
         paddle.set_device("cpu")
         with self.assertRaises(RuntimeError):
             loss_ref, param_ref, grad_ref = run_model(recompute_block=[2])
python/paddle/fluid/tests/unittests/test_dygraph_recompute_for_eager.py (new file, 0 → 100755)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import os

os.environ['FLAGS_enable_eager_mode'] = '1'

import unittest
import numpy as np

import paddle
from paddle.autograd import PyLayer
from paddle.distributed.fleet.utils import recompute
import random

import paddle.fluid.layers as layers


def get_fc_block(block_idx, input_size, is_last=False):
    block_name = "block_" + str(block_idx)
    block = paddle.nn.Sequential(
        (block_name + "_fc_0",
         paddle.nn.Linear(input_size, input_size, bias_attr=False)),
        (block_name + "_dropout", paddle.nn.Dropout(p=0.5)),
        (block_name + "_relu_1", paddle.nn.ReLU()),
        (block_name + "_fc_1",
         paddle.nn.Linear(input_size, input_size, bias_attr=False)),
        (block_name + "_relu_2", paddle.nn.ReLU()),
    )
    if is_last:
        block.add_sublayer(block_name + "_fc_2",
                           paddle.nn.Linear(input_size, 1,
                                            bias_attr=False))  # add sublayer
    else:
        block.add_sublayer(block_name + "_fc_2",
                           paddle.nn.Linear(input_size,
                                            input_size,
                                            bias_attr=False))  # add sublayer
    return block


class Naive_fc_net(paddle.nn.Layer):

    def __init__(self,
                 input_size=10,
                 recompute_blocks=[1, 3],
                 recompute_kwargs={}):
        super(Naive_fc_net, self).__init__()
        self.recompute_blocks = recompute_blocks
        self.recompute_kwargs = recompute_kwargs
        self.runfunc0 = get_fc_block(0, input_size, is_last=False)
        self.runfunc1 = get_fc_block(1, input_size, is_last=False)
        self.runfunc2 = get_fc_block(2, input_size, is_last=False)
        self.runfunc3 = get_fc_block(3, input_size, is_last=False)
        self.runfunc4 = get_fc_block(4, input_size, is_last=True)

    def forward(self, inputs):

        if 0 in self.recompute_blocks:
            inputs = recompute(self.runfunc0, inputs)
        else:
            inputs = self.runfunc0(inputs)

        if 1 in self.recompute_blocks:
            inputs = recompute(self.runfunc1, inputs)
        else:
            inputs = self.runfunc1(inputs)

        if 2 in self.recompute_blocks:
            inputs = recompute(self.runfunc2, inputs, **self.recompute_kwargs)
        else:
            inputs = self.runfunc2(inputs)

        if 3 in self.recompute_blocks:
            inputs = recompute(self.runfunc3, inputs)
        else:
            inputs = self.runfunc3(inputs)

        if 4 in self.recompute_blocks:
            inputs = recompute(self.runfunc4, inputs)
        else:
            inputs = self.runfunc4(inputs)

        return inputs


def run_model(recompute_block=[],
              recompute_kwargs={},
              enable_autocast=False,
              pure_fp16=False):
    gen = paddle.seed(10)
    gen.manual_seed(10)
    np.random.seed(10)
    random.seed(10)

    batch_size, input_size = 1, 10
    model = Naive_fc_net(input_size,
                         recompute_blocks=recompute_block,
                         recompute_kwargs=recompute_kwargs)
    loss_fn = paddle.nn.MSELoss(reduction='mean')
    optimizer = paddle.optimizer.SGD(learning_rate=0.01,
                                     parameters=model.parameters())

    if enable_autocast:
        scaler = paddle.amp.GradScaler()

    loss_ = []
    param_ = []
    grad_ = []
    for step in range(10):

        x_data = np.random.randn(batch_size, input_size).astype(np.float32)
        x = paddle.to_tensor(x_data)
        # x.stop_gradient = False
        level = 'O2' if pure_fp16 else 'O1'
        with paddle.amp.auto_cast(True, level=level):
            y_pred = model(x)
            loss = y_pred.mean()
        if enable_autocast:
            scaler.scale(loss).backward()
            scaler.minimize(optimizer, loss)
        else:
            loss_.append(np.asarray(loss).tolist())
            loss.backward()
            optimizer.step()

        param_.append(np.asarray(model.parameters()[9]).tolist())
        grad_.append(np.asarray(model.parameters()[3]._grad_ivar()).tolist())

        optimizer.clear_grad()
    return loss_, param_, grad_


class TestPyLayer(unittest.TestCase):

    def test_base_case(self, enable_autocast=False, pure_fp16=False):

        def check_identical(loss_ref, param_ref, grad_ref, loss, param, grad):
            self.assertEqual(loss_ref, loss)
            self.assertEqual(param_ref, param)
            self.assertEqual(grad_ref, grad)

        # without recompute
        loss_ref, param_ref, grad_ref = run_model(
            recompute_block=[],
            enable_autocast=enable_autocast,
            pure_fp16=pure_fp16)

        # recompute second block
        loss, param, grad = run_model(recompute_block=[1],
                                      enable_autocast=enable_autocast,
                                      pure_fp16=pure_fp16)
        check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)

        # recompute fourth block
        loss, param, grad = run_model(recompute_block=[3],
                                      enable_autocast=enable_autocast,
                                      pure_fp16=pure_fp16)
        check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)

        # recompute second to fourth block
        loss, param, grad = run_model(recompute_block=[1, 2, 3],
                                      enable_autocast=enable_autocast,
                                      pure_fp16=pure_fp16)
        check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)

        # recompute second & fourth block
        loss, param, grad = run_model(recompute_block=[1, 3],
                                      enable_autocast=enable_autocast,
                                      pure_fp16=pure_fp16)
        check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)

    def test_fc_net_with_dropout(self):
        self.test_base_case()

    def test_fc_net_without_restore_rng(self):
        loss_ref, param_ref, grad_ref = run_model(
            recompute_block=[2],
            recompute_kwargs={"preserve_rng_state": False},
            enable_autocast=True)

    def test_fc_net_with_amp(self):
        self.test_base_case(enable_autocast=True)

    def test_fc_net_with_fp16(self):
        self.test_base_case(enable_autocast=True, pure_fp16=True)

    def test_recompute_kwargs(self):
        paddle.set_device("gpu")
        kwargs = {"is_test": False}
        with self.assertRaises(ValueError):
            loss_ref, param_ref, grad_ref = run_model(recompute_block=[2],
                                                      recompute_kwargs=kwargs)

    def test_recompute_cpu_rng(self):
        paddle.set_device("cpu")
        with self.assertRaises(RuntimeError):
            loss_ref, param_ref, grad_ref = run_model(recompute_block=[2])


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py
@@ -15,6 +15,9 @@
 from __future__ import print_function
 
 import os
+
+os.environ['FLAGS_enable_eager_mode'] = '0'
+
 import unittest
 import paddle.fluid as fluid
@@ -25,15 +28,12 @@ class TestDygraphShardingStage3(TestMultipleGpus):
     # check sharding logic as well as the accuracy with single mode
     def test_dygraph_sharding_stage3(self):
-        self.run_mnist_2gpu('dygraph_group_sharded_stage3.py')
         self.run_mnist_2gpu('dygraph_sharding_stage3.py', eager_mode=False)
 
     def test_dygraph_sharding_stage3_offload(self):
-        self.run_mnist_2gpu('dygraph_group_sharded_stage3_offload.py')
         self.run_mnist_2gpu('dygraph_sharding_stage3_offload.py',
                             eager_mode=False)
 
 
 if __name__ == "__main__":
-    os.environ["FLAGS_enable_eager_mode"] = "1"
     unittest.main()
python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3_for_eager.py (new file, 0 → 100644)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import os

os.environ['FLAGS_enable_eager_mode'] = '1'

import os
import unittest
import paddle.fluid as fluid

from test_parallel_dygraph_dataparallel import TestMultipleGpus


class TestDygraphShardingStage3(TestMultipleGpus):

    # check sharding logic as well as the accuracy with single mode
    def test_dygraph_sharding_stage3(self):
        self.run_mnist_2gpu('dygraph_group_sharded_stage3.py')

    def test_dygraph_sharding_stage3_offload(self):
        self.run_mnist_2gpu('dygraph_group_sharded_stage3_offload.py')


if __name__ == "__main__":
    os.environ["FLAGS_enable_eager_mode"] = "1"
    unittest.main()
python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py
浏览文件 @
a5dc0a79
...
@@ -12,6 +12,10 @@
...
@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
import
os
os
.
environ
[
'FLAGS_enable_eager_mode'
]
=
'0'
import
unittest
import
unittest
import
paddle
import
paddle
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
...
@@ -19,13 +23,11 @@ import paddle.fluid.core as core
...
@@ -19,13 +23,11 @@ import paddle.fluid.core as core
import
numpy
as
np
import
numpy
as
np
import
six
import
six
import
cv2
import
cv2
import
os
import
tempfile
import
tempfile
from
test_imperative_resnet
import
ResNet
,
BottleneckBlock
,
ConvBNLayer
,
train_parameters
,
optimizer_setting
from
test_imperative_resnet
import
ResNet
,
BottleneckBlock
,
ConvBNLayer
,
train_parameters
,
optimizer_setting
import
paddle.nn
as
nn
import
paddle.nn
as
nn
from
paddle.static
import
InputSpec
from
paddle.static
import
InputSpec
from
paddle.autograd
import
PyLayer
from
paddle.autograd
import
PyLayer
from
paddle.fluid.framework
import
_test_eager_guard
if
fluid
.
core
.
is_compiled_with_cuda
():
if
fluid
.
core
.
is_compiled_with_cuda
():
fluid
.
set_flags
({
"FLAGS_cudnn_deterministic"
:
True
})
fluid
.
set_flags
({
"FLAGS_cudnn_deterministic"
:
True
})
...
@@ -73,8 +75,6 @@ class TestAutoCast(unittest.TestCase):
...
@@ -73,8 +75,6 @@ class TestAutoCast(unittest.TestCase):
self
.
assertTrue
(
out_fp32
.
dtype
==
fluid
.
core
.
VarDesc
.
VarType
.
FP32
)
self
.
assertTrue
(
out_fp32
.
dtype
==
fluid
.
core
.
VarDesc
.
VarType
.
FP32
)
def
test_amp_guard_white_op
(
self
):
def
test_amp_guard_white_op
(
self
):
with
_test_eager_guard
():
self
.
amp_guard_white_op
()
self
.
amp_guard_white_op
()
self
.
amp_guard_white_op
()
def
amp_guard_black_op
(
self
):
def
amp_guard_black_op
(
self
):
...
@@ -88,8 +88,6 @@ class TestAutoCast(unittest.TestCase):
...
@@ -88,8 +88,6 @@ class TestAutoCast(unittest.TestCase):
self
.
assertTrue
(
out_fp32
.
dtype
==
fluid
.
core
.
VarDesc
.
VarType
.
FP32
)
self
.
assertTrue
(
out_fp32
.
dtype
==
fluid
.
core
.
VarDesc
.
VarType
.
FP32
)
def
test_amp_guard_black_op
(
self
):
def
test_amp_guard_black_op
(
self
):
with
_test_eager_guard
():
self
.
amp_guard_black_op
()
self
.
amp_guard_black_op
()
self
.
amp_guard_black_op
()
def
custom_op_list
(
self
):
def
custom_op_list
(
self
):
...
@@ -123,8 +121,6 @@ class TestAutoCast(unittest.TestCase):
...
@@ -123,8 +121,6 @@ class TestAutoCast(unittest.TestCase):
|
{
"conv2d"
})
|
{
"conv2d"
})
def
test_custom_op_list
(
self
):
def
test_custom_op_list
(
self
):
with
_test_eager_guard
():
self
.
custom_op_list
()
self
.
custom_op_list
()
self
.
custom_op_list
()
def
custom_op_list_exception
(
self
):
def
custom_op_list_exception
(
self
):
...
@@ -145,8 +141,6 @@ class TestAutoCast(unittest.TestCase):
...
@@ -145,8 +141,6 @@ class TestAutoCast(unittest.TestCase):
self
.
assertRaises
(
ValueError
,
func
)
self
.
assertRaises
(
ValueError
,
func
)
def
test_custom_op_list_exception
(
self
):
def
test_custom_op_list_exception
(
self
):
with
_test_eager_guard
():
self
.
custom_op_list_exception
()
self
.
custom_op_list_exception
()
self
.
custom_op_list_exception
()
def
amp_guard_upsupported_fp16_op
(
self
):
def
amp_guard_upsupported_fp16_op
(
self
):
...
@@ -174,8 +168,6 @@ class TestAutoCast(unittest.TestCase):
...
@@ -174,8 +168,6 @@ class TestAutoCast(unittest.TestCase):
out_purefp16_fp32
.
dtype
==
fluid
.
core
.
VarDesc
.
VarType
.
FP32
)
out_purefp16_fp32
.
dtype
==
fluid
.
core
.
VarDesc
.
VarType
.
FP32
)
def
test_amp_guard_upsupported_fp16_op
(
self
):
def
test_amp_guard_upsupported_fp16_op
(
self
):
with
_test_eager_guard
():
self
.
amp_guard_upsupported_fp16_op
()
self
.
amp_guard_upsupported_fp16_op
()
self
.
amp_guard_upsupported_fp16_op
()
def
mode_exception
(
self
):
def
mode_exception
(
self
):
...
@@ -195,8 +187,6 @@ class TestAutoCast(unittest.TestCase):
...
@@ -195,8 +187,6 @@ class TestAutoCast(unittest.TestCase):
self
.
assertRaises
(
ValueError
,
func
)
self
.
assertRaises
(
ValueError
,
func
)
def
test_mode_exception
(
self
):
def
test_mode_exception
(
self
):
with
_test_eager_guard
():
self
.
mode_exception
()
self
.
mode_exception
()
self
.
mode_exception
()
...
@@ -212,8 +202,6 @@ class TestAmpScaler(unittest.TestCase):
...
@@ -212,8 +202,6 @@ class TestAmpScaler(unittest.TestCase):
data
.
numpy
()
*
1024
),
True
)
data
.
numpy
()
*
1024
),
True
)
def
test_scale
(
self
):
def
test_scale
(
self
):
with
_test_eager_guard
():
self
.
scale
()
self
.
scale
()
self
.
scale
()
def
minimize
(
self
):
def
minimize
(
self
):
...
@@ -265,8 +253,6 @@ class TestAmpScaler(unittest.TestCase):
...
@@ -265,8 +253,6 @@ class TestAmpScaler(unittest.TestCase):
outs_no_scaler
[
1
][
i
][
0
].
numpy
()),
True
)
outs_no_scaler
[
1
][
i
][
0
].
numpy
()),
True
)
def
test_minimize
(
self
):
def
test_minimize
(
self
):
with
_test_eager_guard
():
self
.
minimize
()
self
.
minimize
()
self
.
minimize
()
def
step
(
self
):
def
step
(
self
):
...
@@ -310,8 +296,6 @@ class TestAmpScaler(unittest.TestCase):
@@ -310,8 +296,6 @@ class TestAmpScaler(unittest.TestCase):
                     outs_no_scaler[i].numpy()), True)
 
     def test_step(self):
-        with _test_eager_guard():
-            self.step()
         self.step()
 
     def nan_inf(self):
...
@@ -344,8 +328,6 @@ class TestAmpScaler(unittest.TestCase):
                     np.array_equal(param.numpy(), params_init[param.name]))
 
     def test_nan_inf(self):
-        with _test_eager_guard():
-            self.nan_inf()
         self.nan_inf()
 
     def step_update_exception(self):
...
@@ -396,8 +378,6 @@ class TestAmpScaler(unittest.TestCase):
         self.assertRaises(RuntimeError, func3)
 
     def test_step_update_exception(self):
-        with _test_eager_guard():
-            self.step_update_exception()
         self.step_update_exception()
 
     def test_get_and_set(self):
...
@@ -578,8 +558,6 @@ class TestGradScalerStateDict(unittest.TestCase):
                 self.assertTrue(
                     np.allclose(out_use_state_dict[0], out_no_state_dict[0]))
 
-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
...
@@ -742,8 +720,6 @@ class TestStateDictHookForAMP(unittest.TestCase):
            for key in param_value_ori.keys():
                print(np.equal(param_value_ori[key], param_value_now[key]))
 
-        with _test_eager_guard():
-            func_isinstance()
        func_isinstance()
...
@@ -899,8 +875,6 @@ class TestPureFp16SaveLoad(unittest.TestCase):
                 self.assertTrue(
                     np.allclose(out_use_save_load[0], out_no_save_load[0]))
 
-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
...
@@ -1005,8 +979,6 @@ class TestPureFp16InferenceSaveLoad(unittest.TestCase):
 
     def test_inference_save_load(self):
         self.inference_save_load()
-        with _test_eager_guard():
-            self.inference_save_load()
 
 
 class TestResnet2(unittest.TestCase):
...
@@ -1146,8 +1118,6 @@ class TestResnet2(unittest.TestCase):
                 self.assertTrue(
                     np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2))
 
-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
 
     def test_with_data_loader(self):
...
@@ -1166,8 +1136,6 @@ class TestResnet2(unittest.TestCase):
                 self.assertTrue(
                     np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2))
 
-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
 
     def test_param_group(self):
...
@@ -1189,8 +1157,6 @@ class TestResnet2(unittest.TestCase):
                 self.assertTrue(
                     np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2))
 
-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
...
@@ -1285,8 +1251,6 @@ class TestResnet(unittest.TestCase):
            self.assertTrue(
                np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-1))
 
-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
...
@@ -1308,8 +1272,6 @@ class TestLayerNormFp16(unittest.TestCase):
                     self.assertTrue(
                         out.dtype == fluid.core.VarDesc.VarType.FP16)
 
-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
...
@@ -1344,8 +1306,6 @@ class TestBf16(unittest.TestCase):
                 self.assertTrue(
                     np.allclose(out_fp32, out_bf16_O2, rtol=1.e-3, atol=1.e-1))
 
-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
...
@@ -1399,8 +1359,6 @@ class TestAmpWithHook(unittest.TestCase):
                     loss = a.sum()
                     self.assertRaises(RuntimeError, loss.backward)
 
-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
 
     def test_hook_change_place(self):
...
@@ -1420,8 +1378,6 @@ class TestAmpWithHook(unittest.TestCase):
                     loss = a.sum()
                     self.assertRaises(RuntimeError, loss.backward)
 
-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
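The wrappers removed above relied on _test_eager_guard() to flip each case into eager mode; the eager-only test file that follows instead forces eager mode for the whole process by setting FLAGS_enable_eager_mode before paddle is imported. A minimal sketch of that pattern (the test body is only illustrative and is not taken from the file):

import os

os.environ['FLAGS_enable_eager_mode'] = '1'  # must be set before `import paddle`

import unittest

import paddle


class EagerOnlyExample(unittest.TestCase):

    def test_add(self):
        x = paddle.to_tensor([1.0, 2.0])
        y = paddle.to_tensor([3.0, 4.0])
        self.assertEqual((x + y).shape, [2])


if __name__ == '__main__':
    unittest.main()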
python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py
0 → 100644
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

os.environ['FLAGS_enable_eager_mode'] = '1'

import unittest
import paddle
import paddle.fluid as fluid
import numpy as np
import six
import cv2
import tempfile
from test_imperative_resnet import ResNet, BottleneckBlock, ConvBNLayer, train_parameters, optimizer_setting
import paddle.nn as nn
from paddle.static import InputSpec
from paddle.autograd import PyLayer

if fluid.core.is_compiled_with_cuda():
    fluid.set_flags({"FLAGS_cudnn_deterministic": True})


class SimpleConv(fluid.dygraph.Layer):

    def __init__(self,
                 num_channels,
                 num_filters,
                 filter_size,
                 stride=1,
                 groups=1,
                 act=None):
        super(SimpleConv, self).__init__()
        self._conv = fluid.dygraph.Conv2D(num_channels=num_channels,
                                          num_filters=num_filters,
                                          filter_size=filter_size,
                                          stride=stride,
                                          padding=(filter_size - 1) // 2,
                                          groups=groups,
                                          act=None,
                                          bias_attr=None,
                                          use_cudnn=True)

    def forward(self, inputs):
        return self._conv(inputs)
class TestAutoCast(unittest.TestCase):

    def amp_guard_white_op(self):
        data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
        with fluid.dygraph.guard():
            conv2d = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
            data = fluid.dygraph.to_variable(data)
            with fluid.dygraph.amp_guard(True):
                out_fp16 = conv2d(data)

            with fluid.dygraph.amp_guard(False):
                out_fp32 = conv2d(data)

        self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32)
        self.assertTrue(out_fp16.dtype == fluid.core.VarDesc.VarType.FP16)
        self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32)

    def test_amp_guard_white_op(self):
        self.amp_guard_white_op()

    def amp_guard_black_op(self):
        data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
        with fluid.dygraph.guard():
            data = fluid.dygraph.to_variable(data)
            with fluid.dygraph.amp_guard(True):
                out_fp32 = fluid.layers.mean(data)

        self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32)
        self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32)

    def test_amp_guard_black_op(self):
        self.amp_guard_black_op()

    def custom_op_list(self):
        with fluid.dygraph.guard():
            tracer = fluid.framework._dygraph_tracer()
            base_white_list = fluid.dygraph.amp.auto_cast.WHITE_LIST
            base_black_list = fluid.dygraph.amp.auto_cast.BLACK_LIST
            with fluid.dygraph.amp_guard(custom_white_list=["log"],
                                         custom_black_list=["conv2d"]):
                white_list, black_list = tracer._get_amp_op_list()
                self.assertTrue(
                    set(white_list) == (set(base_white_list) | {"log"}) -
                    {"conv2d"})

                self.assertTrue(
                    set(black_list) == (set(base_black_list) - {"log"}) |
                    {"conv2d"})

            base_white_list = fluid.dygraph.amp.auto_cast.PURE_FP16_WHITE_LIST
            base_black_list = fluid.dygraph.amp.auto_cast.PURE_FP16_BLACK_LIST
            with fluid.dygraph.amp_guard(custom_white_list=["log"],
                                         custom_black_list=["conv2d"],
                                         level='O2'):
                white_list, black_list = tracer._get_amp_op_list()
                self.assertTrue(
                    set(white_list) == (set(base_white_list) | {"log"}) -
                    {"conv2d"})

                self.assertTrue(
                    set(black_list) == (set(base_black_list) - {"log"}) |
                    {"conv2d"})

    def test_custom_op_list(self):
        self.custom_op_list()

    def custom_op_list_exception(self):
        inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)

        def func():
            with fluid.dygraph.guard():
                model = SimpleConv(num_channels=3,
                                   num_filters=64,
                                   filter_size=7,
                                   stride=2,
                                   act='relu')
                with fluid.dygraph.amp_guard(custom_white_list=["conv2d"],
                                             custom_black_list=["conv2d"]):
                    inp = fluid.dygraph.to_variable(inp_np)
                    out = model(inp)

        self.assertRaises(ValueError, func)

    def test_custom_op_list_exception(self):
        self.custom_op_list_exception()

    def amp_guard_upsupported_fp16_op(self):
        data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
        with fluid.dygraph.guard():
            conv2d = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
            data = fluid.dygraph.to_variable(data)
            with fluid.dygraph.amp_guard(True):
                out_amp_fp16 = conv2d(data)
                out_amp_fp32 = paddle.expand_as(
                    out_amp_fp16,
                    out_amp_fp16)  # expand_as_v2 has no fp16 kernel

            with fluid.dygraph.amp_guard(True, level='O2'):
                out_purefp16_fp16 = conv2d(data)
                out_purefp16_fp32 = paddle.expand_as(
                    out_purefp16_fp16,
                    out_purefp16_fp16)  # expand_as_v2 has no fp16 kernel

        self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32)
        self.assertTrue(out_amp_fp16.dtype == fluid.core.VarDesc.VarType.FP16)
        self.assertTrue(out_amp_fp32.dtype == fluid.core.VarDesc.VarType.FP32)
        self.assertTrue(
            out_purefp16_fp16.dtype == fluid.core.VarDesc.VarType.FP16)
        self.assertTrue(
            out_purefp16_fp32.dtype == fluid.core.VarDesc.VarType.FP32)

    def test_amp_guard_upsupported_fp16_op(self):
        self.amp_guard_upsupported_fp16_op()

    def mode_exception(self):

        def func():
            data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
            with fluid.dygraph.guard():
                conv2d = fluid.dygraph.Conv2D(3, 2, 3,
                                              bias_attr=False,
                                              act=None)
                data = fluid.dygraph.to_variable(data)
                with fluid.dygraph.amp_guard(level='O'):
                    out = conv2d(data)

        self.assertRaises(ValueError, func)

    def test_mode_exception(self):
        self.mode_exception()
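# --- Illustrative aside (not part of this test file) ---------------------------
# TestAutoCast above drives the internal fluid.dygraph.amp_guard API directly.
# User code normally reaches the same white/black-list switches through the
# public paddle.amp.auto_cast context manager; a minimal sketch, assuming a
# Paddle build where AMP is effective (on CPU-only builds the cast may be a
# no-op):
#
#     import numpy as np
#     import paddle
#
#     conv = paddle.nn.Conv2D(3, 2, 3)
#     data = paddle.to_tensor(
#         np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32'))
#     with paddle.amp.auto_cast(custom_black_list=['conv2d']):
#         out = conv(data)  # conv2d is black-listed here, so it stays float32
#     print(out.dtype)
# -------------------------------------------------------------------------------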
class
TestAmpScaler
(
unittest
.
TestCase
):
def
scale
(
self
):
with
fluid
.
dygraph
.
guard
():
data
=
paddle
.
rand
([
10
,
1024
])
scaler
=
paddle
.
fluid
.
dygraph
.
AmpScaler
(
init_loss_scaling
=
1024
)
scaled_data
=
scaler
.
scale
(
data
)
self
.
assertEqual
(
np
.
array_equal
(
scaled_data
.
numpy
(),
data
.
numpy
()
*
1024
),
True
)
def
test_scale
(
self
):
self
.
scale
()
def
minimize
(
self
):
inp_np
=
np
.
random
.
random
(
size
=
[
1
,
3
,
128
,
128
]).
astype
(
np
.
float32
)
def
run_simple_conv
(
inp_np
,
use_scaler
=
True
):
paddle
.
seed
(
10
)
paddle
.
framework
.
random
.
_manual_program_seed
(
10
)
with
fluid
.
dygraph
.
guard
():
model
=
SimpleConv
(
num_channels
=
3
,
num_filters
=
64
,
filter_size
=
7
,
stride
=
2
,
act
=
'relu'
)
optimizer
=
fluid
.
optimizer
.
SGDOptimizer
(
learning_rate
=
0.01
,
parameter_list
=
model
.
parameters
())
scaler
=
fluid
.
dygraph
.
AmpScaler
(
init_loss_scaling
=
1024
)
data
=
fluid
.
dygraph
.
to_variable
(
inp_np
)
out
=
model
(
data
)
loss
=
fluid
.
layers
.
mean
(
out
)
if
use_scaler
:
print
(
'use scaler'
)
scaled_loss
=
scaler
.
scale
(
loss
)
scaled_loss
.
backward
()
optimize_ops
,
params_grads
=
scaler
.
minimize
(
optimizer
,
scaled_loss
)
else
:
print
(
'use no scaler'
)
loss
.
backward
()
optimize_ops
,
params_grads
=
optimizer
.
minimize
(
loss
)
return
optimize_ops
,
params_grads
outs_with_scaler
=
run_simple_conv
(
inp_np
,
use_scaler
=
True
)
outs_no_scaler
=
run_simple_conv
(
inp_np
,
use_scaler
=
False
)
self
.
assertEqual
(
outs_with_scaler
[
0
],
[])
# optimize_ops is [] in dygraph mode
self
.
assertEqual
(
outs_no_scaler
[
0
],
[])
# optimize_ops is [] in dygraph mode
for
i
in
range
(
len
(
outs_with_scaler
[
1
])):
# check each grad
self
.
assertEqual
(
np
.
allclose
(
outs_with_scaler
[
1
][
i
][
1
].
numpy
(),
outs_no_scaler
[
1
][
i
][
1
].
numpy
()),
True
)
# check each parameter
self
.
assertEqual
(
np
.
allclose
(
outs_with_scaler
[
1
][
i
][
0
].
numpy
(),
outs_no_scaler
[
1
][
i
][
0
].
numpy
()),
True
)
def
test_minimize
(
self
):
self
.
minimize
()
def
step
(
self
):
inp_np
=
np
.
random
.
random
(
size
=
[
1
,
3
,
128
,
128
]).
astype
(
np
.
float32
)
def
run_simple_conv
(
inp_np
,
use_scaler
=
True
):
paddle
.
seed
(
10
)
paddle
.
framework
.
random
.
_manual_program_seed
(
10
)
with
fluid
.
dygraph
.
guard
():
model
=
SimpleConv
(
num_channels
=
3
,
num_filters
=
64
,
filter_size
=
7
,
stride
=
2
,
act
=
'relu'
)
optimizer
=
paddle
.
optimizer
.
SGD
(
learning_rate
=
0.01
,
parameters
=
model
.
parameters
())
scaler
=
paddle
.
amp
.
GradScaler
(
init_loss_scaling
=
1024
)
data
=
fluid
.
dygraph
.
to_variable
(
inp_np
)
out
=
model
(
data
)
loss
=
fluid
.
layers
.
mean
(
out
)
if
use_scaler
:
print
(
'use scaler'
)
scaled_loss
=
scaler
.
scale
(
loss
)
scaled_loss
.
backward
()
scaler
.
step
(
optimizer
)
scaler
.
update
()
else
:
print
(
'use no scaler'
)
loss
.
backward
()
optimizer
.
step
()
return
optimizer
.
_parameter_list
outs_with_scaler
=
run_simple_conv
(
inp_np
,
use_scaler
=
True
)
outs_no_scaler
=
run_simple_conv
(
inp_np
,
use_scaler
=
False
)
for
i
in
range
(
len
(
outs_with_scaler
)):
# check each parameter
self
.
assertEqual
(
np
.
allclose
(
outs_with_scaler
[
i
].
numpy
(),
outs_no_scaler
[
i
].
numpy
()),
True
)
def
test_step
(
self
):
self
.
step
()
def
nan_inf
(
self
):
inp_np
=
np
.
random
.
random
(
size
=
[
1
,
3
,
128
,
128
]).
astype
(
np
.
float32
)
inp_np
[
0
][
1
][
2
][
3
]
=
np
.
nan
with
fluid
.
dygraph
.
guard
():
model
=
SimpleConv
(
num_channels
=
3
,
num_filters
=
64
,
filter_size
=
7
,
stride
=
2
,
act
=
'relu'
)
params_init
=
{}
for
param
in
model
.
parameters
():
params_init
[
param
.
name
]
=
param
.
numpy
()
optimizer
=
fluid
.
optimizer
.
SGDOptimizer
(
learning_rate
=
0.01
,
parameter_list
=
model
.
parameters
())
scaler
=
fluid
.
dygraph
.
AmpScaler
(
init_loss_scaling
=
1024
)
data
=
fluid
.
dygraph
.
to_variable
(
inp_np
)
out
=
model
(
data
)
loss
=
fluid
.
layers
.
mean
(
out
)
scaled_loss
=
scaler
.
scale
(
loss
)
scaled_loss
.
backward
()
optimize_ops
,
params_grads
=
scaler
.
minimize
(
optimizer
,
scaled_loss
)
self
.
assertEqual
(
scaler
.
_found_inf
.
numpy
()
==
1
,
True
)
for
param
in
model
.
parameters
():
# param not update when tensor contains nan or inf
self
.
assertTrue
(
np
.
array_equal
(
param
.
numpy
(),
params_init
[
param
.
name
]))
def
test_nan_inf
(
self
):
self
.
nan_inf
()
def
step_update_exception
(
self
):
def
func1
():
model
=
paddle
.
nn
.
Conv2D
(
3
,
2
,
3
,
bias_attr
=
True
)
optimizer
=
paddle
.
optimizer
.
SGD
(
learning_rate
=
0.01
,
parameters
=
model
.
parameters
())
scaler
=
paddle
.
amp
.
GradScaler
(
init_loss_scaling
=
1024
)
data
=
paddle
.
rand
([
10
,
3
,
32
,
32
])
conv
=
model
(
data
)
loss
=
paddle
.
mean
(
conv
)
scaled
=
scaler
.
scale
(
loss
)
scaled
.
backward
()
scaler
.
unscale_
(
optimizer
)
scaler
.
unscale_
(
optimizer
)
self
.
assertRaises
(
RuntimeError
,
func1
)
def
func2
():
model
=
paddle
.
nn
.
Conv2D
(
3
,
2
,
3
,
bias_attr
=
True
)
optimizer
=
paddle
.
optimizer
.
SGD
(
learning_rate
=
0.01
,
parameters
=
model
.
parameters
())
scaler
=
paddle
.
amp
.
GradScaler
(
init_loss_scaling
=
1024
)
data
=
paddle
.
rand
([
10
,
3
,
32
,
32
])
conv
=
model
(
data
)
loss
=
paddle
.
mean
(
conv
)
scaled
=
scaler
.
scale
(
loss
)
scaled
.
backward
()
scaler
.
step
(
optimizer
)
scaler
.
unscale_
(
optimizer
)
self
.
assertRaises
(
RuntimeError
,
func2
)
def
func3
():
model
=
paddle
.
nn
.
Conv2D
(
3
,
2
,
3
,
bias_attr
=
True
)
optimizer
=
paddle
.
optimizer
.
SGD
(
learning_rate
=
0.01
,
parameters
=
model
.
parameters
())
scaler
=
paddle
.
amp
.
GradScaler
(
init_loss_scaling
=
1024
)
data
=
paddle
.
rand
([
10
,
3
,
32
,
32
])
conv
=
model
(
data
)
loss
=
paddle
.
mean
(
conv
)
scaled
=
scaler
.
scale
(
loss
)
scaled
.
backward
()
scaler
.
step
(
optimizer
)
scaler
.
step
(
optimizer
)
self
.
assertRaises
(
RuntimeError
,
func3
)
def
test_step_update_exception
(
self
):
self
.
step_update_exception
()
def
test_get_and_set
(
self
):
with
fluid
.
dygraph
.
guard
():
scaler
=
paddle
.
amp
.
GradScaler
(
enable
=
True
,
init_loss_scaling
=
1024
,
incr_ratio
=
2.0
,
decr_ratio
=
0.5
,
incr_every_n_steps
=
1000
,
decr_every_n_nan_or_inf
=
2
,
use_dynamic_loss_scaling
=
True
)
self
.
assertEqual
(
scaler
.
is_enable
()
==
True
,
True
)
self
.
assertEqual
(
scaler
.
get_init_loss_scaling
()
==
1024
,
True
)
self
.
assertEqual
(
scaler
.
get_incr_ratio
()
==
2.0
,
True
)
self
.
assertEqual
(
scaler
.
get_decr_ratio
()
==
0.5
,
True
)
self
.
assertEqual
(
scaler
.
get_incr_every_n_steps
()
==
1000
,
True
)
self
.
assertEqual
(
scaler
.
get_decr_every_n_nan_or_inf
()
==
2
,
True
)
self
.
assertEqual
(
scaler
.
is_use_dynamic_loss_scaling
()
==
True
,
True
)
scaler
.
set_decr_every_n_nan_or_inf
(
4
)
self
.
assertEqual
(
scaler
.
get_decr_every_n_nan_or_inf
()
==
4
,
True
)
scaler
.
set_decr_ratio
(
0.1
)
self
.
assertEqual
(
scaler
.
get_decr_ratio
()
==
0.1
,
True
)
scaler
.
set_incr_every_n_steps
(
200
)
self
.
assertEqual
(
scaler
.
get_incr_every_n_steps
()
==
200
,
True
)
scaler
.
set_incr_ratio
(
3.0
)
self
.
assertEqual
(
scaler
.
get_incr_ratio
()
==
3.0
,
True
)
scaler
.
set_init_loss_scaling
(
100
)
self
.
assertEqual
(
scaler
.
get_init_loss_scaling
()
==
100
,
True
)
def
test_state_dict_and_load_state_dict
(
self
):
with
fluid
.
dygraph
.
guard
():
scaler1
=
paddle
.
amp
.
GradScaler
(
enable
=
True
,
init_loss_scaling
=
14
,
incr_ratio
=
233.0
,
decr_ratio
=
0.523
,
incr_every_n_steps
=
1090
,
decr_every_n_nan_or_inf
=
20
,
use_dynamic_loss_scaling
=
True
)
scaler_state
=
scaler1
.
state_dict
()
scaler2
=
paddle
.
amp
.
GradScaler
(
enable
=
True
)
scaler2
.
load_state_dict
(
scaler_state
)
self
.
assertEqual
(
scaler2
.
get_init_loss_scaling
()
==
14
,
True
)
self
.
assertEqual
(
scaler2
.
get_incr_ratio
()
==
233.0
,
True
)
self
.
assertEqual
(
scaler2
.
get_decr_ratio
()
==
0.523
,
True
)
self
.
assertEqual
(
scaler2
.
get_incr_every_n_steps
()
==
1090
,
True
)
self
.
assertEqual
(
scaler2
.
get_decr_every_n_nan_or_inf
()
==
20
,
True
)
scaler3
=
paddle
.
amp
.
GradScaler
(
enable
=
False
)
scaler3
.
load_state_dict
(
scaler_state
)
self
.
assertEqual
(
scaler3
.
is_enable
()
==
False
,
True
)
def
test_state_dict_and_load_state_dict_error
(
self
):
def
test_error
():
state_empty
=
{}
scaler
=
paddle
.
amp
.
GradScaler
(
enable
=
True
)
scaler
.
load_state_dict
(
state_empty
)
self
.
assertRaises
(
RuntimeError
,
test_error
)
def
reader_decorator
(
reader
):
def
__reader__
():
for
item
in
reader
():
img
=
np
.
array
(
item
[
0
]).
astype
(
'float32'
).
reshape
(
3
,
224
,
224
)
label
=
np
.
array
(
item
[
1
]).
astype
(
'int64'
).
reshape
(
1
)
yield
img
,
label
return
__reader__
class
TestGradScalerStateDict
(
unittest
.
TestCase
):
def
train_resnet
(
self
,
enable_amp
=
True
,
use_data_loader
=
True
,
use_save_load
=
True
):
seed
=
90
batch_size
=
train_parameters
[
"batch_size"
]
batch_num
=
4
paddle
.
seed
(
seed
)
paddle
.
framework
.
random
.
_manual_program_seed
(
seed
)
resnet
=
ResNet
(
use_cudnn
=
True
)
optimizer
=
optimizer_setting
(
train_parameters
,
parameter_list
=
resnet
.
parameters
())
np
.
random
.
seed
(
seed
)
train_reader
=
paddle
.
batch
(
paddle
.
dataset
.
flowers
.
train
(
use_xmap
=
False
),
batch_size
=
batch_size
)
dy_param_init_value
=
{}
for
param
in
resnet
.
parameters
():
dy_param_init_value
[
param
.
name
]
=
param
.
numpy
()
program
=
None
scaler
=
paddle
.
amp
.
GradScaler
(
enable
=
enable_amp
,
init_loss_scaling
=
2.
**
10
)
if
use_data_loader
:
train_reader
=
paddle
.
batch
(
reader_decorator
(
paddle
.
dataset
.
flowers
.
train
(
use_xmap
=
False
)),
batch_size
=
batch_size
,
drop_last
=
True
)
train_loader
=
fluid
.
io
.
DataLoader
.
from_generator
(
capacity
=
4
,
use_double_buffer
=
True
,
iterable
=
True
,
return_list
=
True
)
train_loader
.
set_sample_list_generator
(
train_reader
)
train_reader
=
train_loader
for
batch_id
,
data
in
enumerate
(
train_reader
()):
if
batch_id
>=
batch_num
:
break
if
use_data_loader
:
img
,
label
=
data
else
:
dy_x_data
=
np
.
array
([
x
[
0
].
reshape
(
3
,
224
,
224
)
for
x
in
data
]).
astype
(
'float32'
)
if
len
(
np
.
array
([
x
[
1
]
for
x
in
data
]).
astype
(
'int64'
))
!=
batch_size
:
continue
y_data
=
np
.
array
([
x
[
1
]
for
x
in
data
]).
astype
(
'int64'
).
reshape
(
-
1
,
1
)
img
=
paddle
.
to_tensor
(
dy_x_data
)
label
=
paddle
.
to_tensor
(
y_data
)
label
.
stop_gradient
=
True
with
paddle
.
amp
.
auto_cast
(
enable
=
enable_amp
):
out
=
resnet
(
img
)
loss
=
paddle
.
nn
.
functional
.
cross_entropy
(
input
=
out
,
label
=
label
)
avg_loss
=
paddle
.
mean
(
x
=
loss
)
dy_out
=
avg_loss
.
numpy
()
scaled_loss
=
scaler
.
scale
(
avg_loss
)
scaled_loss
.
backward
()
scaler
.
minimize
(
optimizer
,
scaled_loss
)
dy_grad_value
=
{}
for
param
in
resnet
.
parameters
():
if
param
.
trainable
:
np_array
=
np
.
array
(
param
.
_grad_ivar
().
value
().
get_tensor
())
dy_grad_value
[
param
.
name
+
fluid
.
core
.
grad_var_suffix
()]
=
np_array
resnet
.
clear_gradients
()
dy_param_value
=
{}
for
param
in
resnet
.
parameters
():
dy_param_value
[
param
.
name
]
=
param
.
numpy
()
if
use_save_load
and
batch_id
==
2
:
paddle
.
save
(
scaler
.
state_dict
(),
'ResNet_model.pdparams'
)
dict_load
=
paddle
.
load
(
'ResNet_model.pdparams'
)
scaler
.
load_state_dict
(
dict_load
)
if
use_data_loader
:
train_reader
.
_reset
()
return
dy_out
,
dy_param_value
,
dy_grad_value
def
test_with_state_dict
(
self
):
def
func_isinstance
():
with
fluid
.
dygraph
.
guard
():
out_use_state_dict
=
self
.
train_resnet
(
enable_amp
=
True
,
use_data_loader
=
True
,
use_save_load
=
True
)
out_no_state_dict
=
self
.
train_resnet
(
enable_amp
=
True
,
use_data_loader
=
True
,
use_save_load
=
False
)
print
(
'save_load:'
,
out_use_state_dict
[
0
],
out_no_state_dict
[
0
])
self
.
assertTrue
(
np
.
allclose
(
out_use_state_dict
[
0
],
out_no_state_dict
[
0
]))
func_isinstance
()
class
TestAmpDecorator
(
unittest
.
TestCase
):
def
test_mode_exception
(
self
):
def
func
():
with
fluid
.
dygraph
.
guard
():
model
=
fluid
.
dygraph
.
Conv2D
(
3
,
2
,
3
,
bias_attr
=
False
,
act
=
None
)
opt
=
paddle
.
optimizer
.
SGD
(
parameters
=
model
.
parameters
())
model
,
opt
=
paddle
.
amp
.
decorate
(
models
=
model
,
optimizers
=
opt
,
level
=
'O'
)
self
.
assertRaises
(
ValueError
,
func
)
def
test_input_type_exception
(
self
):
def
test_error_model
():
class
MyModel
(
object
):
def
__init__
(
self
):
print
(
"A fake Model"
)
model
=
MyModel
()
with
fluid
.
dygraph
.
guard
():
paddle
.
amp
.
decorate
(
models
=
model
,
optimizers
=
None
,
level
=
'O2'
)
self
.
assertRaises
(
TypeError
,
test_error_model
)
def
test_error_distributed_model
():
model
=
fluid
.
dygraph
.
Conv2D
(
3
,
2
,
3
,
bias_attr
=
False
,
act
=
None
)
model
=
paddle
.
DataParallel
(
model
)
with
fluid
.
dygraph
.
guard
():
model
=
paddle
.
amp
.
decorate
(
models
=
model
,
level
=
'O2'
)
self
.
assertRaises
(
RuntimeError
,
test_error_distributed_model
)
def
test_error_optimizer
():
class
MyOptimizer
(
object
):
def
__init__
(
self
):
print
(
"A fake Optimizer"
)
model
=
fluid
.
dygraph
.
Conv2D
(
3
,
2
,
3
,
bias_attr
=
False
,
act
=
None
)
opt
=
MyOptimizer
()
with
fluid
.
dygraph
.
guard
():
paddle
.
amp
.
decorate
(
models
=
model
,
optimizers
=
opt
,
level
=
'O2'
)
self
.
assertRaises
(
TypeError
,
test_error_optimizer
)
def
test_set_master_weight
(
self
):
model1
=
fluid
.
dygraph
.
Conv2D
(
3
,
2
,
3
,
bias_attr
=
False
,
act
=
None
)
opt1
=
paddle
.
optimizer
.
Adam
(
learning_rate
=
0.0001
,
parameters
=
model1
.
parameters
(),
multi_precision
=
True
)
model2
=
fluid
.
dygraph
.
Conv2D
(
3
,
2
,
3
,
bias_attr
=
False
,
act
=
None
)
opt2
=
paddle
.
optimizer
.
Adam
(
learning_rate
=
0.0001
,
parameters
=
model2
.
parameters
(),
multi_precision
=
False
)
model1
,
opt1
=
paddle
.
amp
.
decorate
(
models
=
model1
,
optimizers
=
opt1
,
level
=
'O2'
,
master_weight
=
None
)
self
.
assertEqual
(
opt1
.
_multi_precision
,
True
)
models
,
opt2
=
paddle
.
amp
.
decorate
(
models
=
[
model1
,
model2
],
optimizers
=
opt2
,
level
=
'O2'
,
master_weight
=
None
)
self
.
assertEqual
(
opt2
.
_multi_precision
,
True
)
model3
=
fluid
.
dygraph
.
Conv2D
(
3
,
2
,
3
,
bias_attr
=
False
,
act
=
None
)
opt3
=
paddle
.
optimizer
.
Adam
(
learning_rate
=
0.0001
,
parameters
=
model3
.
parameters
())
model4
=
fluid
.
dygraph
.
Conv2D
(
3
,
2
,
3
,
bias_attr
=
False
,
act
=
None
)
opt4
=
paddle
.
optimizer
.
Adam
(
learning_rate
=
0.0001
,
parameters
=
model4
.
parameters
())
model3
,
opts
=
paddle
.
amp
.
decorate
(
models
=
model3
,
optimizers
=
[
opt3
,
opt4
],
level
=
'O2'
,
master_weight
=
True
)
self
.
assertEqual
(
opts
[
0
].
_multi_precision
,
True
)
self
.
assertEqual
(
opts
[
1
].
_multi_precision
,
True
)
models
=
[
model3
,
model4
]
optimizers
=
[
opt3
,
opt4
]
models
,
optimizers
=
paddle
.
amp
.
decorate
(
models
=
models
,
optimizers
=
optimizers
,
level
=
'O2'
,
master_weight
=
False
)
self
.
assertEqual
(
optimizers
[
0
].
_multi_precision
,
False
)
self
.
assertEqual
(
optimizers
[
1
].
_multi_precision
,
False
)
def
test_skip_BatchNorm_Layer_norm
(
self
):
model
=
paddle
.
nn
.
LayerNorm
(
1
)
model
=
paddle
.
amp
.
decorate
(
models
=
model
,
level
=
'O2'
)
for
param
in
model
.
parameters
():
self
.
assertEqual
((
param
.
dtype
==
paddle
.
float32
),
True
)
model
=
paddle
.
nn
.
BatchNorm
(
1
)
model
=
paddle
.
amp
.
decorate
(
models
=
model
,
level
=
'O2'
)
for
param
in
model
.
parameters
():
self
.
assertEqual
((
param
.
dtype
==
paddle
.
float32
),
True
)
model
=
paddle
.
nn
.
BatchNorm1D
(
1
)
model
=
paddle
.
amp
.
decorate
(
models
=
model
,
level
=
'O2'
)
for
param
in
model
.
parameters
():
self
.
assertEqual
((
param
.
dtype
==
paddle
.
float32
),
True
)
model
=
paddle
.
nn
.
BatchNorm2D
(
1
)
model
=
paddle
.
amp
.
decorate
(
models
=
model
,
level
=
'O2'
)
for
param
in
model
.
parameters
():
self
.
assertEqual
((
param
.
dtype
==
paddle
.
float32
),
True
)
model
=
paddle
.
nn
.
BatchNorm3D
(
1
)
model
=
paddle
.
amp
.
decorate
(
models
=
model
,
level
=
'O2'
)
for
param
in
model
.
parameters
():
self
.
assertEqual
((
param
.
dtype
==
paddle
.
float32
),
True
)
class
TestStateDictHookForAMP
(
unittest
.
TestCase
):
def
test_state_dict_hook
(
self
):
def
func_isinstance
():
paddle
.
seed
(
100
)
model
=
paddle
.
nn
.
Linear
(
2
,
4
)
model
=
paddle
.
amp
.
decorate
(
models
=
model
,
level
=
'O2'
,
save_dtype
=
'float32'
)
param_value_ori
=
{}
for
param
in
model
.
parameters
():
param_value_ori
[
param
.
name
]
=
param
.
numpy
()
state_dict
=
model
.
state_dict
()
for
key
,
value
in
state_dict
.
items
():
state_dict
[
key
]
=
value
.
cast
(
"float16"
)
model
.
set_state_dict
(
state_dict
)
param_value_now
=
{}
for
param
in
model
.
parameters
():
param_value_now
[
param
.
name
]
=
param
.
numpy
()
for
key
in
param_value_ori
.
keys
():
print
(
np
.
equal
(
param_value_ori
[
key
],
param_value_now
[
key
]))
func_isinstance
()
class
TestPureFp16SaveLoad
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
test_save_dtype_exception
(
self
):
def
func
():
paddle
.
disable_static
()
model
=
fluid
.
dygraph
.
Conv2D
(
3
,
2
,
3
,
bias_attr
=
False
,
act
=
None
)
opt
=
paddle
.
optimizer
.
SGD
(
parameters
=
model
.
parameters
())
paddle
.
amp
.
decorate
(
models
=
model
,
optimizers
=
opt
,
level
=
'O2'
,
save_dtype
=
'int'
)
self
.
assertRaises
(
ValueError
,
func
)
def
train_resnet
(
self
,
enable_amp
=
True
,
use_data_loader
=
True
,
use_save_load
=
True
):
seed
=
90
batch_size
=
train_parameters
[
"batch_size"
]
batch_num
=
4
paddle
.
seed
(
seed
)
paddle
.
framework
.
random
.
_manual_program_seed
(
seed
)
resnet
=
ResNet
(
use_cudnn
=
True
)
optimizer
=
optimizer_setting
(
train_parameters
,
parameter_list
=
resnet
.
parameters
())
np
.
random
.
seed
(
seed
)
train_reader
=
paddle
.
batch
(
paddle
.
dataset
.
flowers
.
train
(
use_xmap
=
False
),
batch_size
=
batch_size
)
dy_param_init_value
=
{}
for
param
in
resnet
.
parameters
():
dy_param_init_value
[
param
.
name
]
=
param
.
numpy
()
program
=
None
scaler
=
paddle
.
amp
.
GradScaler
(
enable
=
enable_amp
,
init_loss_scaling
=
2.
**
10
)
if
use_data_loader
:
train_reader
=
paddle
.
batch
(
reader_decorator
(
paddle
.
dataset
.
flowers
.
train
(
use_xmap
=
False
)),
batch_size
=
batch_size
,
drop_last
=
True
)
train_loader
=
fluid
.
io
.
DataLoader
.
from_generator
(
capacity
=
4
,
use_double_buffer
=
True
,
iterable
=
True
,
return_list
=
True
)
train_loader
.
set_sample_list_generator
(
train_reader
)
train_reader
=
train_loader
if
enable_amp
:
resnet
,
optimizer
=
paddle
.
amp
.
decorate
(
models
=
resnet
,
optimizers
=
optimizer
,
level
=
'O2'
,
save_dtype
=
'float32'
)
for
batch_id
,
data
in
enumerate
(
train_reader
()):
if
batch_id
>=
batch_num
:
break
if
use_data_loader
:
img
,
label
=
data
else
:
dy_x_data
=
np
.
array
([
x
[
0
].
reshape
(
3
,
224
,
224
)
for
x
in
data
]).
astype
(
'float32'
)
if
len
(
np
.
array
([
x
[
1
]
for
x
in
data
]).
astype
(
'int64'
))
!=
batch_size
:
continue
y_data
=
np
.
array
([
x
[
1
]
for
x
in
data
]).
astype
(
'int64'
).
reshape
(
-
1
,
1
)
img
=
paddle
.
to_tensor
(
dy_x_data
)
label
=
paddle
.
to_tensor
(
y_data
)
label
.
stop_gradient
=
True
with
paddle
.
amp
.
auto_cast
(
enable
=
enable_amp
,
level
=
'O2'
):
out
=
resnet
(
img
)
loss
=
paddle
.
nn
.
functional
.
cross_entropy
(
input
=
out
,
label
=
label
)
loss
=
paddle
.
cast
(
loss
,
'float32'
)
avg_loss
=
paddle
.
mean
(
x
=
loss
)
dy_out
=
avg_loss
.
numpy
()
scaled_loss
=
scaler
.
scale
(
avg_loss
)
scaled_loss
.
backward
()
scaler
.
minimize
(
optimizer
,
scaled_loss
)
dy_grad_value
=
{}
for
param
in
resnet
.
parameters
():
if
param
.
trainable
:
np_array
=
np
.
array
(
param
.
_grad_ivar
().
value
().
get_tensor
())
dy_grad_value
[
param
.
name
+
fluid
.
core
.
grad_var_suffix
()]
=
np_array
resnet
.
clear_gradients
()
dy_param_value
=
{}
for
param
in
resnet
.
parameters
():
dy_param_value
[
param
.
name
]
=
param
.
numpy
()
if
use_save_load
and
batch_id
==
2
:
# paddle.save
obj
=
{
'model'
:
resnet
.
state_dict
(),
'opt'
:
optimizer
.
state_dict
(),
'scaler'
:
scaler
.
state_dict
()
}
path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
'model.pdparams'
)
paddle
.
save
(
obj
,
path
)
# paddle.load
obj_load
=
paddle
.
load
(
path
)
resnet
=
ResNet
(
use_cudnn
=
True
)
optimizer
=
optimizer_setting
(
train_parameters
,
parameter_list
=
resnet
.
parameters
())
resnet
.
set_state_dict
(
obj_load
[
'model'
])
optimizer
.
set_state_dict
(
obj_load
[
'opt'
])
scaler
.
load_state_dict
(
obj_load
[
'scaler'
])
resnet
,
optimizer
=
paddle
.
amp
.
decorate
(
models
=
resnet
,
optimizers
=
optimizer
,
level
=
'O2'
,
save_dtype
=
'float32'
)
if
use_data_loader
:
train_reader
.
_reset
()
return
dy_out
,
dy_param_value
,
dy_grad_value
def
test_with_save_load
(
self
):
def
func_isinstance
():
with
fluid
.
dygraph
.
guard
():
out_use_save_load
=
self
.
train_resnet
(
enable_amp
=
True
,
use_data_loader
=
True
,
use_save_load
=
True
)
out_no_save_load
=
self
.
train_resnet
(
enable_amp
=
True
,
use_data_loader
=
True
,
use_save_load
=
False
)
print
(
'save_load:'
,
out_use_save_load
[
0
],
out_no_save_load
[
0
])
self
.
assertTrue
(
np
.
allclose
(
out_use_save_load
[
0
],
out_no_save_load
[
0
]))
func_isinstance
()
class
TestPureFp16InferenceSaveLoad
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
inference_save_load
(
self
):
BATCH_SIZE
=
16
BATCH_NUM
=
4
EPOCH_NUM
=
4
IMAGE_SIZE
=
784
CLASS_NUM
=
10
# define a random dataset
class
RandomDataset
(
paddle
.
io
.
Dataset
):
def
__init__
(
self
,
num_samples
):
self
.
num_samples
=
num_samples
def
__getitem__
(
self
,
idx
):
image
=
np
.
random
.
random
([
IMAGE_SIZE
]).
astype
(
'float32'
)
label
=
np
.
random
.
randint
(
0
,
CLASS_NUM
-
1
,
(
1
,
)).
astype
(
'int64'
)
return
image
,
label
def
__len__
(
self
):
return
self
.
num_samples
class
LinearNet
(
nn
.
Layer
):
def
__init__
(
self
):
super
(
LinearNet
,
self
).
__init__
()
self
.
_linear
=
nn
.
Linear
(
IMAGE_SIZE
,
CLASS_NUM
)
def
forward
(
self
,
x
):
return
self
.
_linear
(
x
)
def
train
(
layer
,
loader
,
loss_fn
,
opt
):
for
epoch_id
in
range
(
EPOCH_NUM
):
for
batch_id
,
(
image
,
label
)
in
enumerate
(
loader
()):
with
paddle
.
amp
.
auto_cast
(
enable
=
True
,
custom_white_list
=
None
,
custom_black_list
=
None
,
level
=
'O2'
):
out
=
layer
(
image
)
loss
=
loss_fn
(
out
,
label
)
loss
.
backward
()
opt
.
step
()
opt
.
clear_grad
()
# train
layer
=
LinearNet
()
adam
=
paddle
.
optimizer
.
Adam
(
learning_rate
=
0.001
,
parameters
=
layer
.
parameters
(),
multi_precision
=
True
)
loss_fn
=
nn
.
CrossEntropyLoss
()
layer
,
adam
=
paddle
.
amp
.
decorate
(
models
=
layer
,
optimizers
=
adam
,
save_dtype
=
'float32'
)
dataset
=
RandomDataset
(
BATCH_NUM
*
BATCH_SIZE
)
loader
=
paddle
.
io
.
DataLoader
(
dataset
,
batch_size
=
BATCH_SIZE
,
shuffle
=
True
,
drop_last
=
True
,
num_workers
=
2
)
train
(
layer
,
loader
,
loss_fn
,
adam
)
# save
path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
'example_model/linear'
)
paddle
.
jit
.
save
(
layer
,
path
,
input_spec
=
[
InputSpec
(
shape
=
[
IMAGE_SIZE
],
name
=
'x'
)])
# jit.load
loaded_layer
=
paddle
.
jit
.
load
(
path
)
# inference
loaded_layer
.
eval
()
x
=
np
.
random
.
randn
(
1
,
IMAGE_SIZE
).
astype
(
'float32'
)
x_tensor
=
paddle
.
to_tensor
(
x
)
pred
=
loaded_layer
(
x_tensor
)
# load_inference_model
paddle
.
enable_static
()
exe
=
paddle
.
static
.
Executor
()
[
inference_program
,
feed_target_names
,
fetch_targets
]
=
(
paddle
.
static
.
load_inference_model
(
path
,
exe
))
tensor_img
=
x
results
=
exe
.
run
(
inference_program
,
feed
=
{
feed_target_names
[
0
]:
tensor_img
},
fetch_list
=
fetch_targets
)
print
(
"pred.numpy()"
,
pred
.
numpy
())
print
(
"result"
,
results
[
0
])
self
.
assertTrue
(
np
.
array_equal
(
pred
.
numpy
(),
results
[
0
]))
paddle
.
disable_static
()
def
test_inference_save_load
(
self
):
self
.
inference_save_load
()
class
TestResnet2
(
unittest
.
TestCase
):
"""
Use paddle-2.0 API
"""
def
train_resnet
(
self
,
enable_amp
=
True
,
level
=
'O1'
,
use_data_loader
=
False
,
use_param_group
=
False
):
seed
=
90
batch_size
=
train_parameters
[
"batch_size"
]
batch_num
=
10
paddle
.
seed
(
seed
)
paddle
.
framework
.
random
.
_manual_program_seed
(
seed
)
resnet
=
ResNet
(
use_cudnn
=
True
)
if
use_param_group
:
conv_params
=
resnet
.
conv
.
parameters
()
other_params
=
[]
for
p
in
resnet
.
parameters
():
contains
=
False
for
q
in
conv_params
:
if
p
is
q
:
contains
=
True
if
not
contains
:
other_params
.
append
(
p
)
# NOTE(zhiqiu): The Membership test operations(in / not in) calls "is" and "equal",
# see details: https://docs.python.org/3/reference/expressions.html#membership-test-operations.
# So do not use other_params = [p for p in resnet.parameters() if p not in conv_params]
optimizer
=
paddle
.
optimizer
.
Momentum
(
parameters
=
[{
'params'
:
conv_params
,
'learning_rate'
:
0.01
},
{
'params'
:
other_params
,
'learning_rate'
:
0.001
}],
multi_precision
=
True
)
else
:
optimizer
=
paddle
.
optimizer
.
SGD
(
parameters
=
resnet
.
parameters
())
np
.
random
.
seed
(
seed
)
train_reader
=
paddle
.
batch
(
paddle
.
dataset
.
flowers
.
train
(
use_xmap
=
False
),
batch_size
=
batch_size
)
dy_param_init_value
=
{}
for
param
in
resnet
.
parameters
():
dy_param_init_value
[
param
.
name
]
=
param
.
numpy
()
program
=
None
scaler
=
paddle
.
amp
.
GradScaler
(
enable
=
enable_amp
,
init_loss_scaling
=
2.
**
10
)
if
use_data_loader
:
train_reader
=
paddle
.
batch
(
reader_decorator
(
paddle
.
dataset
.
flowers
.
train
(
use_xmap
=
False
)),
batch_size
=
batch_size
,
drop_last
=
True
)
train_loader
=
fluid
.
io
.
DataLoader
.
from_generator
(
capacity
=
4
,
use_double_buffer
=
True
,
iterable
=
True
,
return_list
=
True
)
train_loader
.
set_sample_list_generator
(
train_reader
)
train_reader
=
train_loader
if
enable_amp
and
(
level
==
'O2'
):
resnet
=
paddle
.
amp
.
decorate
(
models
=
resnet
,
level
=
'O2'
)
for
batch_id
,
data
in
enumerate
(
train_reader
()):
if
batch_id
>=
batch_num
:
break
if
use_data_loader
:
img
,
label
=
data
else
:
dy_x_data
=
np
.
array
([
x
[
0
].
reshape
(
3
,
224
,
224
)
for
x
in
data
]).
astype
(
'float32'
)
if
len
(
np
.
array
([
x
[
1
]
for
x
in
data
]).
astype
(
'int64'
))
!=
batch_size
:
continue
y_data
=
np
.
array
([
x
[
1
]
for
x
in
data
]).
astype
(
'int64'
).
reshape
(
-
1
,
1
)
img
=
paddle
.
to_tensor
(
dy_x_data
)
label
=
paddle
.
to_tensor
(
y_data
)
label
.
stop_gradient
=
True
with
paddle
.
amp
.
auto_cast
(
enable
=
enable_amp
,
level
=
level
):
out
=
resnet
(
img
)
loss
=
paddle
.
nn
.
functional
.
cross_entropy
(
input
=
out
,
label
=
label
)
loss
=
paddle
.
cast
(
loss
,
'float32'
)
avg_loss
=
paddle
.
mean
(
x
=
loss
)
dy_out
=
avg_loss
.
numpy
()
scaled_loss
=
scaler
.
scale
(
avg_loss
)
scaled_loss
.
backward
()
scaler
.
unscale_
(
optimizer
)
scaler
.
step
(
optimizer
)
scaler
.
update
()
dy_grad_value
=
{}
for
param
in
resnet
.
parameters
():
if
param
.
trainable
:
np_array
=
np
.
array
(
param
.
_grad_ivar
().
value
().
get_tensor
())
dy_grad_value
[
param
.
name
+
fluid
.
core
.
grad_var_suffix
()]
=
np_array
resnet
.
clear_gradients
()
dy_param_value
=
{}
for
param
in
resnet
.
parameters
():
dy_param_value
[
param
.
name
]
=
param
.
numpy
()
if
use_data_loader
:
train_reader
.
_reset
()
return
dy_out
,
dy_param_value
,
dy_grad_value
def
test_resnet
(
self
):
def
func_isinstance
():
with
fluid
.
dygraph
.
guard
():
out_fp32
=
self
.
train_resnet
(
enable_amp
=
False
)
out_amp
=
self
.
train_resnet
(
enable_amp
=
True
)
out_pure_fp16
=
self
.
train_resnet
(
enable_amp
=
True
,
level
=
'O2'
)
print
(
out_fp32
[
0
],
out_amp
[
0
],
out_pure_fp16
[
0
])
self
.
assertTrue
(
np
.
allclose
(
out_fp32
[
0
],
out_amp
[
0
],
atol
=
1.e-5
))
self
.
assertTrue
(
np
.
allclose
(
out_fp32
[
0
],
out_pure_fp16
[
0
],
atol
=
1.e-2
))
func_isinstance
()
def
test_with_data_loader
(
self
):
def
func_isinstance
():
with
fluid
.
dygraph
.
guard
():
out_fp32
=
self
.
train_resnet
(
enable_amp
=
False
,
use_data_loader
=
True
)
out_amp
=
self
.
train_resnet
(
enable_amp
=
True
,
use_data_loader
=
True
)
out_pure_fp16
=
self
.
train_resnet
(
enable_amp
=
True
,
use_data_loader
=
True
,
level
=
'O2'
)
print
(
out_fp32
[
0
],
out_amp
[
0
],
out_pure_fp16
[
0
])
self
.
assertTrue
(
np
.
allclose
(
out_fp32
[
0
],
out_amp
[
0
],
atol
=
1.e-5
))
self
.
assertTrue
(
np
.
allclose
(
out_fp32
[
0
],
out_pure_fp16
[
0
],
atol
=
1.e-2
))
func_isinstance
()
def
test_param_group
(
self
):
def
func_isinstance
():
with
fluid
.
dygraph
.
guard
():
out_fp32
=
self
.
train_resnet
(
enable_amp
=
False
,
use_data_loader
=
True
,
use_param_group
=
True
)
out_amp
=
self
.
train_resnet
(
enable_amp
=
True
,
use_data_loader
=
True
,
use_param_group
=
True
)
out_pure_fp16
=
self
.
train_resnet
(
enable_amp
=
True
,
use_data_loader
=
True
,
use_param_group
=
True
,
level
=
'O2'
)
print
(
out_fp32
[
0
],
out_amp
[
0
],
out_pure_fp16
[
0
])
self
.
assertTrue
(
np
.
allclose
(
out_fp32
[
0
],
out_amp
[
0
],
atol
=
1.e-5
))
self
.
assertTrue
(
np
.
allclose
(
out_fp32
[
0
],
out_pure_fp16
[
0
],
atol
=
1.e-2
))
func_isinstance
()
class
TestResnet
(
unittest
.
TestCase
):
"""
Use paddle-1.x API
"""
def
train_resnet
(
self
,
enable_amp
=
True
,
level
=
'O1'
):
seed
=
90
batch_size
=
train_parameters
[
"batch_size"
]
batch_num
=
1
with
fluid
.
dygraph
.
guard
():
paddle
.
seed
(
seed
)
paddle
.
framework
.
random
.
_manual_program_seed
(
seed
)
resnet
=
ResNet
(
use_cudnn
=
True
)
optimizer
=
optimizer_setting
(
train_parameters
,
parameter_list
=
resnet
.
parameters
())
optimizer
=
paddle
.
optimizer
.
Momentum
(
parameters
=
resnet
.
parameters
(),
multi_precision
=
True
)
np
.
random
.
seed
(
seed
)
train_reader
=
paddle
.
batch
(
paddle
.
dataset
.
flowers
.
train
(
use_xmap
=
False
),
batch_size
=
batch_size
)
dy_param_init_value
=
{}
for
param
in
resnet
.
parameters
():
dy_param_init_value
[
param
.
name
]
=
param
.
numpy
()
program
=
None
scaler
=
paddle
.
fluid
.
dygraph
.
AmpScaler
(
enable
=
enable_amp
,
init_loss_scaling
=
2.
**
10
)
if
enable_amp
and
(
level
==
'O2'
):
resnet
,
optimizer
=
paddle
.
fluid
.
dygraph
.
amp_decorate
(
models
=
resnet
,
optimizers
=
optimizer
,
level
=
'O2'
)
for
batch_id
,
data
in
enumerate
(
train_reader
()):
if
batch_id
>=
batch_num
:
break
dy_x_data
=
np
.
array
([
x
[
0
].
reshape
(
3
,
224
,
224
)
for
x
in
data
]).
astype
(
'float32'
)
if
len
(
np
.
array
([
x
[
1
]
for
x
in
data
]).
astype
(
'int64'
))
!=
batch_size
:
continue
y_data
=
np
.
array
([
x
[
1
]
for
x
in
data
]).
astype
(
'int64'
).
reshape
(
-
1
,
1
)
img
=
fluid
.
dygraph
.
to_variable
(
dy_x_data
)
label
=
fluid
.
dygraph
.
to_variable
(
y_data
)
label
.
stop_gradient
=
True
with
paddle
.
fluid
.
dygraph
.
amp_guard
(
enable
=
enable_amp
,
level
=
level
):
out
=
resnet
(
img
)
loss
=
fluid
.
layers
.
cross_entropy
(
input
=
out
,
label
=
label
)
avg_loss
=
fluid
.
layers
.
mean
(
x
=
loss
)
dy_out
=
avg_loss
.
numpy
()
scaled_loss
=
scaler
.
scale
(
avg_loss
)
scaled_loss
.
backward
()
scaler
.
minimize
(
optimizer
,
scaled_loss
)
dy_grad_value
=
{}
for
param
in
resnet
.
parameters
():
if
param
.
trainable
:
np_array
=
np
.
array
(
param
.
_grad_ivar
().
value
().
get_tensor
())
dy_grad_value
[
param
.
name
+
fluid
.
core
.
grad_var_suffix
()]
=
np_array
resnet
.
clear_gradients
()
dy_param_value
=
{}
for
param
in
resnet
.
parameters
():
dy_param_value
[
param
.
name
]
=
param
.
numpy
()
return
dy_out
,
dy_param_value
,
dy_grad_value
def
test_resnet
(
self
):
def
func_isinstance
():
out_fp32
=
self
.
train_resnet
(
enable_amp
=
False
)
out_amp
=
self
.
train_resnet
(
enable_amp
=
True
)
out_pure_fp16
=
self
.
train_resnet
(
enable_amp
=
True
,
level
=
'O2'
)
print
(
out_fp32
[
0
],
out_amp
[
0
],
out_pure_fp16
[
0
])
self
.
assertTrue
(
np
.
allclose
(
out_fp32
[
0
],
out_amp
[
0
],
atol
=
1.e-2
))
self
.
assertTrue
(
np
.
allclose
(
out_fp32
[
0
],
out_pure_fp16
[
0
],
atol
=
1.e-1
))
func_isinstance
()
class TestLayerNormFp16(unittest.TestCase):
    r''' layer_norm and batch_norm support mixed inputs, i.e., only input x is fp16
    and other params are fp32.
    '''

    def test_layer_norm_fp16(self):

        def func_isinstance():
            if fluid.is_compiled_with_cuda():
                with fluid.dygraph.guard(fluid.CUDAPlace(0)):
                    x = paddle.rand([2, 2, 2, 3])
                    layer_norm = paddle.nn.LayerNorm(x.shape[1:])
                    with paddle.amp.auto_cast(custom_white_list=['layer_norm']):
                        out = layer_norm(x)

                    self.assertTrue(
                        out.dtype == fluid.core.VarDesc.VarType.FP16)

        func_isinstance()


class TestBf16(unittest.TestCase):
    '''
    test amp for BF16
    '''

    def train(self, enable_amp=True, amp_level='O1'):
        paddle.seed(100)
        input = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
        conv = paddle.nn.Conv2D(4, 6, (3, 3))
        with paddle.amp.auto_cast(enable=enable_amp,
                                  level=amp_level,
                                  dtype='bfloat16'):
            output = conv(input)
        output = output.cast('float32')
        return output.numpy()

    def test_bf16(self):

        def func_isinstance():
            if fluid.core.is_compiled_with_cuda(
            ) and fluid.core.is_bfloat16_supported(paddle.CUDAPlace(0)):
                out_fp32 = self.train(enable_amp=False)
                out_bf16_O1 = self.train(enable_amp=True, amp_level='O1')
                out_bf16_O2 = self.train(enable_amp=True, amp_level='O2')
                self.assertTrue(
                    np.allclose(out_fp32, out_bf16_O1, rtol=1.e-3, atol=1.e-1))
                self.assertTrue(
                    np.allclose(out_fp32, out_bf16_O2, rtol=1.e-3, atol=1.e-1))

        func_isinstance()


class TestAmpWithPyLyer(unittest.TestCase):

    def test_pylayer(self):

        class MyMM(PyLayer):

            @staticmethod
            def forward(ctx, a, b):
                ctx.save_for_backward(a, b)
                return a.mm(b)

            @staticmethod
            def backward(ctx, grad):
                a, b = ctx.saved_tensor()
                # NOTE(zhiqiu): a and b is float32 now, while grad is fp16 when forward runs with auto_cast()
                # thus, the mm operation raise errors because of the dtype of inputs are inconsistent before.
                return grad.mm(b.t()), a.t().mm(grad)

        x = paddle.rand([10, 10])
        y = paddle.rand([10, 10])
        x.stop_gradient = False
        y.stop_gradient = False

        # with paddle.amp.auto_cast():
        res = MyMM.apply(x, y)
        loss = paddle.mean(res)
        loss.backward()


class TestAmpWithHook(unittest.TestCase):

    def test_hook_change_dtype(self):

        def func_isinstance():
            with paddle.fluid.dygraph.guard():
                v = paddle.rand([3, 3])
                v.stop_gradient = False

                def foo(grad):
                    print('grad', grad, grad.dtype)  # grad's dtype is float32
                    res = paddle.mm(grad, grad)  # mm runs in fp16
                    print('res', res, res.dtype)  # res's dtype is float16
                    return res

                v.register_hook(foo)

                with paddle.amp.auto_cast():
                    a = paddle.mm(v, v)
                    loss = a.sum()
                    self.assertRaises(RuntimeError, loss.backward)

        func_isinstance()

    def test_hook_change_place(self):

        def func_isinstance():
            with paddle.fluid.dygraph.guard():
                v = paddle.rand([3, 3])
                v.stop_gradient = False

                def foo(grad):
                    res = grad.cpu()  # change place
                    return res

                v.register_hook(foo)

                with paddle.amp.auto_cast():
                    a = paddle.mm(v, v)
                    loss = a.sum()
                    self.assertRaises(RuntimeError, loss.backward)

        func_isinstance()


if __name__ == '__main__':
    unittest.main()
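The NOTE in TestAmpWithPyLyer above describes why the auto_cast() call around MyMM.apply is left commented out: the tensors saved in forward stay float32 while the incoming gradient is float16, so the mm in backward sees mixed dtypes. A minimal sketch of one way to make such a layer tolerant of that, by casting the gradient back to the saved dtype in backward (an illustrative variant written for this note, not part of the test file):

import paddle
from paddle.autograd import PyLayer


class CastSafeMM(PyLayer):
    # Illustrative variant of MyMM: backward casts the incoming gradient to the
    # dtype of the saved tensors, so it also works when forward ran under auto_cast().

    @staticmethod
    def forward(ctx, a, b):
        ctx.save_for_backward(a, b)
        return a.mm(b)

    @staticmethod
    def backward(ctx, grad):
        a, b = ctx.saved_tensor()
        if grad.dtype != a.dtype:
            grad = paddle.cast(grad, a.dtype)
        return grad.mm(b.t()), a.t().mm(grad)


x = paddle.rand([4, 4])
y = paddle.rand([4, 4])
x.stop_gradient = False
y.stop_gradient = False
with paddle.amp.auto_cast():
    out = CastSafeMM.apply(x, y)
paddle.mean(out).backward()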
python/paddle/fluid/tests/unittests/test_pylayer_op.py
@@ -18,7 +18,7 @@ import unittest
 import numpy as np
 import paddle
-from paddle.autograd import PyLayer, EagerPyLayer
+from paddle.autograd.py_layer import LegacyPyLayer, EagerPyLayer
 from paddle.fluid.framework import _test_eager_guard, in_dygraph_mode
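After this rename the eager implementation is exported as paddle.autograd.PyLayer, while the previous dygraph implementation stays importable as LegacyPyLayer from paddle.autograd.py_layer, as the hunk above shows. A minimal sketch of a custom layer written against the new spelling (illustrative only; the tanh example mirrors the ones used throughout this test file):

import paddle
from paddle.autograd import PyLayer  # formerly EagerPyLayer


class Tanh(PyLayer):

    @staticmethod
    def forward(ctx, x):
        y = paddle.tanh(x)
        ctx.save_for_backward(y)
        return y

    @staticmethod
    def backward(ctx, dy):
        y, = ctx.saved_tensor()
        return dy * (1 - paddle.square(y))


x = paddle.rand([3, 3])
x.stop_gradient = False
Tanh.apply(x).sum().backward()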
@@ -32,7 +32,7 @@ class TestPyLayer(unittest.TestCase):
...
@@ -32,7 +32,7 @@ class TestPyLayer(unittest.TestCase):
def
func_test_simple_pylayer_multiple_output
(
self
):
def
func_test_simple_pylayer_multiple_output
(
self
):
class
tanh
(
EagerPyLayer
if
in_dygraph_mode
()
else
PyLayer
):
class
tanh
(
EagerPyLayer
if
in_dygraph_mode
()
else
Legacy
PyLayer
):
@
staticmethod
@
staticmethod
def
forward
(
ctx
,
x1
,
x2
,
func1
,
func2
=
paddle
.
square
):
def
forward
(
ctx
,
x1
,
x2
,
func1
,
func2
=
paddle
.
square
):
...
@@ -70,7 +70,7 @@ class TestPyLayer(unittest.TestCase):
...
@@ -70,7 +70,7 @@ class TestPyLayer(unittest.TestCase):
def
func_test_simple_pylayer_return_none_with_no_grad
(
self
):
def
func_test_simple_pylayer_return_none_with_no_grad
(
self
):
class
tanh
(
EagerPyLayer
if
in_dygraph_mode
()
else
PyLayer
):
class
tanh
(
EagerPyLayer
if
in_dygraph_mode
()
else
Legacy
PyLayer
):
@
staticmethod
@
staticmethod
def
forward
(
ctx
,
x1
,
x2
,
func1
,
func2
=
paddle
.
square
):
def
forward
(
ctx
,
x1
,
x2
,
func1
,
func2
=
paddle
.
square
):
...
@@ -112,7 +112,7 @@ class TestPyLayer(unittest.TestCase):
...
@@ -112,7 +112,7 @@ class TestPyLayer(unittest.TestCase):
def
func_test_simple_pylayer_single_output
(
self
):
def
func_test_simple_pylayer_single_output
(
self
):
class
tanh
(
EagerPyLayer
if
in_dygraph_mode
()
else
PyLayer
):
class
tanh
(
EagerPyLayer
if
in_dygraph_mode
()
else
Legacy
PyLayer
):
@
staticmethod
@
staticmethod
def
forward
(
ctx
,
x1
,
func1
,
func2
=
paddle
.
square
):
def
forward
(
ctx
,
x1
,
func1
,
func2
=
paddle
.
square
):
...
@@ -146,7 +146,7 @@ class TestPyLayer(unittest.TestCase):
...
@@ -146,7 +146,7 @@ class TestPyLayer(unittest.TestCase):
def
func_test_pylayer_num_output_match
(
self
):
def
func_test_pylayer_num_output_match
(
self
):
class
tanh
(
EagerPyLayer
if
in_dygraph_mode
()
else
PyLayer
):
class
tanh
(
EagerPyLayer
if
in_dygraph_mode
()
else
Legacy
PyLayer
):
@
staticmethod
@
staticmethod
def
forward
(
def
forward
(
...
@@ -175,7 +175,7 @@ class TestPyLayer(unittest.TestCase):
...
@@ -175,7 +175,7 @@ class TestPyLayer(unittest.TestCase):
def
func_test_pylayer_dtype
(
self
):
def
func_test_pylayer_dtype
(
self
):
class
tanh
(
EagerPyLayer
if
in_dygraph_mode
()
else
PyLayer
):
class
tanh
(
EagerPyLayer
if
in_dygraph_mode
()
else
Legacy
PyLayer
):
@
staticmethod
@
staticmethod
def
forward
(
ctx
,
x
,
dtype
):
def
forward
(
ctx
,
x
,
dtype
):
...
@@ -206,7 +206,7 @@ class TestPyLayer(unittest.TestCase):
...
@@ -206,7 +206,7 @@ class TestPyLayer(unittest.TestCase):
def
func_test_pylayer_Exception_forward
(
self
):
def
func_test_pylayer_Exception_forward
(
self
):
class
Layer_None1
(
EagerPyLayer
if
in_dygraph_mode
()
else
PyLayer
):
class
Layer_None1
(
EagerPyLayer
if
in_dygraph_mode
()
else
Legacy
PyLayer
):
@
staticmethod
@
staticmethod
def
forward
(
ctx
,
*
args
):
def
forward
(
ctx
,
*
args
):
...
@@ -220,7 +220,7 @@ class TestPyLayer(unittest.TestCase):
...
@@ -220,7 +220,7 @@ class TestPyLayer(unittest.TestCase):
with
self
.
assertRaises
(
ValueError
):
with
self
.
assertRaises
(
ValueError
):
z
=
Layer_None1
.
apply
(
input1
)
z
=
Layer_None1
.
apply
(
input1
)
class
Layer_None2
(
EagerPyLayer
if
in_dygraph_mode
()
else
PyLayer
):
class
Layer_None2
(
EagerPyLayer
if
in_dygraph_mode
()
else
Legacy
PyLayer
):
@
staticmethod
@
staticmethod
def
forward
(
ctx
,
*
args
):
def
forward
(
ctx
,
*
args
):
...
@@ -234,7 +234,7 @@ class TestPyLayer(unittest.TestCase):
...
@@ -234,7 +234,7 @@ class TestPyLayer(unittest.TestCase):
# return None
# return None
z
=
Layer_None2
.
apply
(
input1
)
z
=
Layer_None2
.
apply
(
input1
)
class
Layer_one1
(
EagerPyLayer
if
in_dygraph_mode
()
else
PyLayer
):
class
Layer_one1
(
EagerPyLayer
if
in_dygraph_mode
()
else
Legacy
PyLayer
):
@
staticmethod
@
staticmethod
def
forward
(
ctx
,
*
args
):
def
forward
(
ctx
,
*
args
):
...
@@ -249,7 +249,7 @@ class TestPyLayer(unittest.TestCase):
...
@@ -249,7 +249,7 @@ class TestPyLayer(unittest.TestCase):
with
self
.
assertRaises
(
ValueError
):
with
self
.
assertRaises
(
ValueError
):
z
=
Layer_one1
.
apply
(
input1
)
z
=
Layer_one1
.
apply
(
input1
)
class
Layer_one2
(
EagerPyLayer
if
in_dygraph_mode
()
else
PyLayer
):
class
Layer_one2
(
EagerPyLayer
if
in_dygraph_mode
()
else
Legacy
PyLayer
):
@
staticmethod
@
staticmethod
def
forward
(
ctx
,
*
args
):
def
forward
(
ctx
,
*
args
):
...
@@ -263,7 +263,7 @@ class TestPyLayer(unittest.TestCase):
...
@@ -263,7 +263,7 @@ class TestPyLayer(unittest.TestCase):
# return int
# return int
z
=
Layer_one2
.
apply
(
input1
)
z
=
Layer_one2
.
apply
(
input1
)
class
Layer_no_fw
(
EagerPyLayer
if
in_dygraph_mode
()
else
PyLayer
):
class
Layer_no_fw
(
EagerPyLayer
if
in_dygraph_mode
()
else
Legacy
PyLayer
):
@
staticmethod
@
staticmethod
def
backward
(
ctx
,
*
args
):
def
backward
(
ctx
,
*
args
):
...
@@ -280,7 +280,7 @@ class TestPyLayer(unittest.TestCase):
...
@@ -280,7 +280,7 @@ class TestPyLayer(unittest.TestCase):
def
func_test_pylayer_nograd
(
self
):
def
func_test_pylayer_nograd
(
self
):
class
tanh
(
EagerPyLayer
if
in_dygraph_mode
()
else
PyLayer
):
class
tanh
(
EagerPyLayer
if
in_dygraph_mode
()
else
Legacy
PyLayer
):
@
staticmethod
@
staticmethod
def
forward
(
ctx
,
x1
,
func1
,
func2
=
paddle
.
square
,
xx
=
None
):
def
forward
(
ctx
,
x1
,
func1
,
func2
=
paddle
.
square
,
xx
=
None
):
...
@@ -305,7 +305,8 @@ class TestPyLayer(unittest.TestCase):
...
@@ -305,7 +305,8 @@ class TestPyLayer(unittest.TestCase):
def
func_test_pylayer_Exception_bk
(
self
):
def
func_test_pylayer_Exception_bk
(
self
):
class
Layer_bk_none1
(
EagerPyLayer
if
in_dygraph_mode
()
else
PyLayer
):
class
Layer_bk_none1
(
EagerPyLayer
if
in_dygraph_mode
()
else
LegacyPyLayer
):
@
staticmethod
@
staticmethod
def
forward
(
ctx
,
x
):
def
forward
(
ctx
,
x
):
...
@@ -322,7 +323,8 @@ class TestPyLayer(unittest.TestCase):
...
@@ -322,7 +323,8 @@ class TestPyLayer(unittest.TestCase):
with
self
.
assertRaises
(
ValueError
):
with
self
.
assertRaises
(
ValueError
):
z
.
sum
().
backward
()
z
.
sum
().
backward
()
class
Layer_bk_none2
(
EagerPyLayer
if
in_dygraph_mode
()
else
PyLayer
):
class
Layer_bk_none2
(
EagerPyLayer
if
in_dygraph_mode
()
else
LegacyPyLayer
):
@
staticmethod
@
staticmethod
def
forward
(
ctx
,
x1
,
x2
):
def
forward
(
ctx
,
x1
,
x2
):
...
@@ -339,7 +341,8 @@ class TestPyLayer(unittest.TestCase):
...
@@ -339,7 +341,8 @@ class TestPyLayer(unittest.TestCase):
with
self
.
assertRaises
(
ValueError
):
with
self
.
assertRaises
(
ValueError
):
z
.
mean
().
backward
()
z
.
mean
().
backward
()
class
Layer_bk_one1
(
EagerPyLayer
if
in_dygraph_mode
()
else
PyLayer
):
class
Layer_bk_one1
(
EagerPyLayer
if
in_dygraph_mode
()
else
LegacyPyLayer
):
@
staticmethod
@
staticmethod
def
forward
(
ctx
,
x
):
def
forward
(
ctx
,
x
):
...
@@ -356,7 +359,8 @@ class TestPyLayer(unittest.TestCase):
...
@@ -356,7 +359,8 @@ class TestPyLayer(unittest.TestCase):
with
self
.
assertRaises
(
ValueError
):
with
self
.
assertRaises
(
ValueError
):
z
.
mean
().
backward
()
z
.
mean
().
backward
()
class
Layer_bk_one2
(
EagerPyLayer
if
in_dygraph_mode
()
else
PyLayer
):
class
Layer_bk_one2
(
EagerPyLayer
if
in_dygraph_mode
()
else
LegacyPyLayer
):
@
staticmethod
@
staticmethod
def
forward
(
ctx
,
x1
,
x2
):
def
forward
(
ctx
,
x1
,
x2
):
...
@@ -374,7 +378,7 @@ class TestPyLayer(unittest.TestCase):
...
@@ -374,7 +378,7 @@ class TestPyLayer(unittest.TestCase):
with
self
.
assertRaises
(
ValueError
):
with
self
.
assertRaises
(
ValueError
):
z
.
mean
().
backward
()
z
.
mean
().
backward
()
class
Layer_no_bk
(
EagerPyLayer
if
in_dygraph_mode
()
else
PyLayer
):
class
Layer_no_bk
(
EagerPyLayer
if
in_dygraph_mode
()
else
Legacy
PyLayer
):
@
staticmethod
@
staticmethod
def
forward
(
ctx
,
x
):
def
forward
(
ctx
,
x
):
...
@@ -388,7 +392,8 @@ class TestPyLayer(unittest.TestCase):
...
@@ -388,7 +392,8 @@ class TestPyLayer(unittest.TestCase):
z
=
z
[
0
]
+
z
[
1
]
z
=
z
[
0
]
+
z
[
1
]
z
.
mean
().
backward
()
z
.
mean
().
backward
()
class
Layer_bk_match
(
EagerPyLayer
if
in_dygraph_mode
()
else
PyLayer
):
class
Layer_bk_match
(
EagerPyLayer
if
in_dygraph_mode
()
else
LegacyPyLayer
):
@
staticmethod
@
staticmethod
def
forward
(
ctx
,
x
):
def
forward
(
ctx
,
x
):
...
@@ -412,7 +417,8 @@ class TestPyLayer(unittest.TestCase):
     def func_test_pylayer_bk_return_none(self):
-        class Layer_bk_none1(EagerPyLayer if in_dygraph_mode() else PyLayer):
+        class Layer_bk_none1(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
             @staticmethod
             def forward(ctx, x1, x2):
...
@@ -431,7 +437,8 @@ class TestPyLayer(unittest.TestCase):
         with self.assertRaises(ValueError):
             z.mean().backward()
 
-        class Layer_bk_none2(EagerPyLayer if in_dygraph_mode() else PyLayer):
+        class Layer_bk_none2(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
             @staticmethod
             def forward(ctx, x1, x2):
...
@@ -457,7 +464,7 @@ class TestPyLayer(unittest.TestCase):
     def func_test_pylayer_inplace(self):
-        class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+        class cus_tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
             @staticmethod
             def forward(ctx, x):
...
@@ -494,7 +501,8 @@ class TestPyLayer(unittest.TestCase):
     def test_pylayer_inplace_backward_error(self):
         with _test_eager_guard():
-            class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+            class cus_tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
                 @staticmethod
                 def forward(ctx, x):
...
@@ -530,7 +538,8 @@ class TestPyLayer(unittest.TestCase):
     def test_pylayer_inplace_backward_success_1(self):
         with _test_eager_guard():
-            class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+            class cus_tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
                 @staticmethod
                 def forward(ctx, x):
...
@@ -564,7 +573,8 @@ class TestPyLayer(unittest.TestCase):
     def test_pylayer_inplace_backward_success_2(self):
         with _test_eager_guard():
-            class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+            class cus_tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
                 @staticmethod
                 def forward(ctx, x):
...
@@ -597,7 +607,8 @@ class TestPyLayer(unittest.TestCase):
     def func_test_pylayer_inplace_and_leaf_exception(self):
-        class cus_pylayer_op(EagerPyLayer if in_dygraph_mode() else PyLayer):
+        class cus_pylayer_op(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
             @staticmethod
             def forward(ctx, x):
...
@@ -633,7 +644,7 @@ class TestPyLayer(unittest.TestCase):
     def func_test_backward_in_backward(self):
-        class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+        class cus_tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
             @staticmethod
             def forward(ctx, x):
...
@@ -665,7 +676,7 @@ class TestPyLayer(unittest.TestCase):
     def func_test_return_to_tensor(self):
-        class Tanh(EagerPyLayer if in_dygraph_mode() else PyLayer):
+        class Tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
             @staticmethod
             def forward(ctx, x1):
...
@@ -779,7 +790,7 @@ class TestPyLayerReturnType(unittest.TestCase):
     def test_forward_args_fake_tensor(self):
-        class Tanh(PyLayer):
+        class Tanh(LegacyPyLayer):
             @staticmethod
             def forward(ctx, x1):
...
@@ -797,7 +808,7 @@ class TestPyLayerReturnType(unittest.TestCase):
     def test_forward_kwargs_fake_tensor(self):
-        class Tanh(PyLayer):
+        class Tanh(LegacyPyLayer):
             @staticmethod
             def forward(ctx, x1):
...
@@ -815,7 +826,7 @@ class TestPyLayerReturnType(unittest.TestCase):
     def test_forward_return_fake_tensor(self):
-        class Tanh(PyLayer):
+        class Tanh(LegacyPyLayer):
             @staticmethod
             def forward(ctx, x1):
...
@@ -833,7 +844,7 @@ class TestPyLayerReturnType(unittest.TestCase):
     def test_forward_return_fake_tensor_tuple(self):
-        class Tanh(PyLayer):
+        class Tanh(LegacyPyLayer):
             @staticmethod
             def forward(ctx, x1):
...
@@ -851,7 +862,7 @@ class TestPyLayerReturnType(unittest.TestCase):
     def test_backward_return_fake_tensor_tuple(self):
-        class Tanh(PyLayer):
+        class Tanh(LegacyPyLayer):
             @staticmethod
             def forward(ctx, x1, x2):
...
@@ -871,7 +882,7 @@ class TestPyLayerReturnType(unittest.TestCase):
     def test_backward_return_fake_tensor(self):
-        class Tanh(PyLayer):
+        class Tanh(LegacyPyLayer):
             @staticmethod
             def forward(ctx, x1):
...
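For readers following these test hunks: the non-eager base class is switched from PyLayer to LegacyPyLayer while the EagerPyLayer branch is kept. Below is a minimal, hedged sketch of the custom-op pattern the tests exercise, using only calls visible elsewhere in this diff (forward/backward staticmethods, ctx.save_for_backward, ctx.saved_tensor, .apply). The class name cus_tanh and the tensor shape are illustrative, not taken from the commit.

import paddle
from paddle.autograd import PyLayer


class cus_tanh(PyLayer):
    # forward receives a context object plus whatever is passed to .apply()
    @staticmethod
    def forward(ctx, x):
        y = paddle.tanh(x)
        ctx.save_for_backward(y)      # stash tensors needed by backward
        return y

    # backward receives the same context and the upstream gradient
    @staticmethod
    def backward(ctx, dy):
        y, = ctx.saved_tensor()       # tensors saved in forward
        return dy * (1 - paddle.square(y))


x = paddle.rand([2, 3])
x.stop_gradient = False
z = cus_tanh.apply(x)                 # custom ops are invoked through .apply()
z.mean().backward()
print(x.grad.shape)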
python/paddle/incubate/distributed/models/moe/moe_layer.py
Browse file @ a5dc0a79
...
@@ -31,7 +31,7 @@ from paddle.distributed import alltoall, all_gather
 from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
 from paddle.distributed import fleet
-from paddle.autograd import PyLayer, EagerPyLayer
+from paddle.autograd import PyLayer
 from .gate import NaiveGate, GShardGate, SwitchGate, BaseGate
 from .utils import count_by_gate
 from paddle.distributed.fleet.meta_parallel.pp_utils.utils import _hp_recompute
...
@@ -132,53 +132,6 @@ class MoEScatter(PyLayer):
         return grad_in, None, None, None
 
-
-class EagerMoEScatter(EagerPyLayer):
-    r"""
-    Scatter input samples from [batch x sequences] to contiguous alone experts.
-    If `world_size` is greater than 1, the samples will first be locally
-    scattered, and then exchanged across workers.
-    """
-
-    @staticmethod
-    def forward(ctx, inp, pos, local_expert_count, global_expert_count,
-                fwd_batch_size, world_size, group=None):
-        local_input_buf = _local_scatter(inp, pos)
-        if world_size > 1:
-            global_input_buf = global_scatter(local_input_buf,
-                                              local_expert_count,
-                                              global_expert_count,
-                                              group=group)
-        else:
-            global_input_buf = local_input_buf
-
-        ctx.moe_args = inp.shape[0], world_size, group
-
-        variables = (pos, local_expert_count, global_expert_count)
-        ctx.save_for_backward(*variables)
-        return global_input_buf
-
-    @staticmethod
-    def backward(ctx, grad):
-        (pos, local_expert_count, global_expert_count) = ctx.saved_tensor()
-        (inp_batch_size, world_size, group) = ctx.moe_args
-
-        if world_size > 1:
-            local_grad_in = global_gather(grad,
-                                          local_expert_count,
-                                          global_expert_count,
-                                          group=group)
-        else:
-            local_grad_in = grad
-        grad_in = _local_gather(local_grad_in, pos, inp_batch_size)
-        return grad_in, None, None, None
-
-
 class MoEGather(PyLayer):
     r"""
     Gather output samples from contiguous alone experts back to [batch x
...
@@ -226,53 +179,6 @@ class MoEGather(PyLayer):
         return global_grad_out_buf, None, None, None
 
-
-class EagerMoEGather(EagerPyLayer):
-    r"""
-    Gather output samples from contiguous alone experts back to [batch x
-    sequences]. Works symmetrically with MoEScatter.
-    """
-
-    @staticmethod
-    def forward(ctx, global_output_buf, pos, local_expert_count,
-                global_expert_count, local_batch_size, world_size, group=None):
-        if world_size > 1:
-            local_output_buf = global_gather(global_output_buf,
-                                             local_expert_count,
-                                             global_expert_count,
-                                             group=group)
-        else:
-            local_output_buf = global_output_buf
-        output = _local_gather(local_output_buf,
-                               pos,
-                               local_batch_size,
-                               maybe_overlap=False)
-
-        ctx.moe_args = (global_output_buf.shape[0], world_size, group)
-        variables = (pos, local_expert_count, global_expert_count)
-        ctx.save_for_backward(*variables)
-        return output
-
-    @staticmethod
-    def backward(ctx, grad_out):
-        pos, local_expert_count, global_expert_count = ctx.saved_tensor()
-        fwd_batch_size, world_size, group = ctx.moe_args
-        grad_out_buf = _local_scatter(grad_out, pos)
-        if world_size > 1:
-            global_grad_out_buf = global_scatter(grad_out_buf,
-                                                 local_expert_count,
-                                                 global_expert_count,
-                                                 group=group)
-        else:
-            global_grad_out_buf = grad_out_buf
-        return global_grad_out_buf, None, None, None
-
-
 class AllGather(PyLayer):
     r"""
     A wrapper for the All-Gather function to support auto-differentiation.
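The deleted EagerMoEScatter/EagerMoEGather bodies above mirror the surviving MoEScatter/MoEGather: forward reorders samples into expert-contiguous order via pos (exchanging them across ranks when world_size > 1), and backward re-gathers gradients along the same positions. The snippet below is a single-process illustration of that reordering symmetry only; it is not the library's _local_scatter/_local_gather implementation, just a rough stand-in built from paddle.gather and paddle.scatter with a made-up pos index.

import paddle

# Assumption: pos is a permutation that sorts samples into expert-contiguous
# order, as produced upstream by prepare_forward / count_by_gate.
inp = paddle.to_tensor([[0.0], [1.0], [2.0], [3.0]])
pos = paddle.to_tensor([2, 0, 3, 1])          # hypothetical expert-sorted order

# Rough stand-in for _local_scatter(inp, pos): reorder rows by pos.
scattered = paddle.gather(inp, pos)

# Rough stand-in for _local_gather: write rows back to their original slots.
restored = paddle.scatter(paddle.zeros_like(inp), pos, scattered)

print(bool((restored == inp).all()))          # True: the gather undoes the scatter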
...
@@ -295,28 +201,6 @@ class AllGather(PyLayer):
                             ends=[(rank + 1) * dim0])
 
-
-class EagerAllGather(EagerPyLayer):
-    r"""
-    A wrapper for the All-Gather function to support auto-differentiation.
-    """
-
-    @staticmethod
-    def forward(ctx, inp, rank, world_size, group):
-        tensor_list = []
-        paddle.distributed.all_gather(tensor_list, inp, group=group)
-        output = paddle.concat(tensor_list, axis=0)
-        ctx.args = rank, inp.shape[0]
-        return output
-
-    @staticmethod
-    def backward(ctx, grad_out):
-        rank, dim0 = ctx.args
-        return paddle.slice(grad_out,
-                            axes=[0],
-                            starts=[rank * dim0],
-                            ends=[(rank + 1) * dim0])
-
-
 class Slice(PyLayer):
     r"""
     A wrapper for the Slice function to support auto-differentiation.
...
@@ -341,30 +225,6 @@ class Slice(PyLayer):
         return _all_gather(grad_out, group=group)
 
-
-class EagerSlice(EagerPyLayer):
-    r"""
-    A wrapper for the Slice function to support auto-differentiation.
-    """
-
-    @staticmethod
-    def forward(ctx, inp, rank, world_size, group):
-        B = inp.shape[0]
-        local_batch_size = B // world_size
-        batch_start = local_batch_size * rank
-        batch_end = min(batch_start + local_batch_size, B)
-        inp = paddle.slice(inp,
-                           axes=[0],
-                           starts=[batch_start],
-                           ends=[batch_end])
-        ctx.args = world_size, group
-        return inp
-
-    @staticmethod
-    def backward(ctx, grad_out):
-        world_size, group = ctx.args
-        return _all_gather(grad_out, group=group)
-
-
 def prepare_forward(gate, num_expert, world_size, moe_group):
     pos, local_expert_count, global_expert_count = count_by_gate(
         gate, num_expert, world_size, group=moe_group)
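The removed EagerAllGather/EagerSlice duplicate the shape bookkeeping of the surviving AllGather/Slice: Slice keeps each rank's contiguous chunk of the batch in forward and all-gathers gradients in backward, while AllGather concatenates the per-rank shards in forward and slices the gradient back down in backward. Below is a local, single-process sketch of that arithmetic; the collective calls are replaced by plain tensors, and world_size, rank, and the shapes are made up for illustration.

import paddle

world_size, rank, dim0 = 2, 0, 4

# What all_gather + concat produces in AllGather.forward: world_size shards
# stacked along axis 0.
shards = [paddle.full([dim0, 3], float(r)) for r in range(world_size)]
gathered = paddle.concat(shards, axis=0)            # shape [world_size * dim0, 3]

# AllGather.backward keeps only this rank's slice of the incoming gradient.
local_grad = paddle.slice(gathered, axes=[0],
                          starts=[rank * dim0], ends=[(rank + 1) * dim0])

# Slice.forward: partition a batch of B rows into contiguous per-rank chunks.
B = gathered.shape[0]
local_batch_size = B // world_size
batch_start = local_batch_size * rank
batch_end = min(batch_start + local_batch_size, B)
local_inp = paddle.slice(gathered, axes=[0],
                         starts=[batch_start], ends=[batch_end])

print(local_grad.shape, local_inp.shape)            # both [4, 3]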
...
@@ -517,10 +377,7 @@ class MoELayer(nn.Layer):
         mp_rank = self.mp_group.rank
         mp_size = self.mp_group.nranks
         if mp_size > 1:
-            if in_dygraph_mode():
-                inp = EagerSlice.apply(inp, mp_rank, mp_size, self.mp_group)
-            else:
-                inp = Slice.apply(inp, mp_rank, mp_size, self.mp_group)
+            inp = Slice.apply(inp, mp_rank, mp_size, self.mp_group)
         value, gate = self.gate(inp)
...
@@ -541,14 +398,9 @@ class MoELayer(nn.Layer):
             temp_pos = pos
         assert topk == self.top_k
-        if in_dygraph_mode():
-            x = EagerMoEScatter.apply(inp, temp_pos, local_expert_count,
-                                      global_expert_count, fwd_batch_size,
-                                      self.world_size, self.group)
-        else:
-            x = MoEScatter.apply(inp, temp_pos, local_expert_count,
-                                 global_expert_count, fwd_batch_size,
-                                 self.world_size, self.group)
+        x = MoEScatter.apply(inp, temp_pos, local_expert_count,
+                             global_expert_count, fwd_batch_size,
+                             self.world_size, self.group)
 
         d_model = self.d_model
...
@@ -577,23 +429,15 @@ class MoELayer(nn.Layer):
         if len(gate.shape) == 2:
             out_batch_size *= gate.shape[1]
-        if in_dygraph_mode():
-            x = EagerMoEGather.apply(x, pos, local_expert_count,
-                                     global_expert_count, out_batch_size,
-                                     self.world_size, self.group)
-        else:
-            x = MoEGather.apply(x, pos, local_expert_count,
-                                global_expert_count, out_batch_size,
-                                self.world_size, self.group)
+        x = MoEGather.apply(x, pos, local_expert_count, global_expert_count,
+                            out_batch_size, self.world_size, self.group)
 
         x = x.reshape([-1, self.top_k, d_model])
         value = value.reshape([x.shape[0], 1, self.top_k])
         x = paddle.bmm(value, x).reshape([-1, d_model])
 
         if mp_size > 1:
-            if in_dygraph_mode():
-                x = EagerAllGather.apply(x, mp_rank, mp_size, self.mp_group)
-            else:
-                x = AllGather.apply(x, mp_rank, mp_size, self.mp_group)
+            x = AllGather.apply(x, mp_rank, mp_size, self.mp_group)
         x = paddle.reshape_(x, origin_shape)
...
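The context lines kept in the last hunk show how MoELayer combines the gathered expert outputs: x is reshaped to [batch, top_k, d_model], the gate values to [batch, 1, top_k], and a batched matmul takes the gate-weighted sum over the top_k experts. A standalone numeric sketch of that combine step follows; the sizes are made up for illustration.

import paddle

batch, top_k, d_model = 8, 2, 16                 # illustrative sizes
x = paddle.rand([batch * top_k, d_model])        # per-(sample, expert) outputs
value = paddle.rand([batch, top_k])              # gate weights per sample

x = x.reshape([-1, top_k, d_model])              # [batch, top_k, d_model]
value = value.reshape([x.shape[0], 1, top_k])    # [batch, 1, top_k]
x = paddle.bmm(value, x).reshape([-1, d_model])  # weighted sum -> [batch, d_model]
print(x.shape)                                   # [8, 16]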