Unverified · Commit a5dc0a79, authored by W wanghuancoder, committed by GitHub

[Eager] Rename EagerPyLayer to PyLayer (#43696)

* rename eagerpylayer
Parent 8a122ecc
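For users, the visible effect of the rename is that eager-mode code now imports `PyLayer` directly and the `EagerPyLayer` name disappears. Below is a minimal sketch of a custom layer written against the renamed API; it mirrors the `cus_tanh` helper used by the tests in this diff and assumes eager mode is enabled.

    import paddle
    from paddle.autograd import PyLayer  # resolves to the eager implementation under eager mode


    class cus_tanh(PyLayer):

        @staticmethod
        def forward(ctx, x):
            y = paddle.tanh(x)
            ctx.save_for_backward(y)  # stash the activation for the backward pass
            return y

        @staticmethod
        def backward(ctx, dy):
            y, = ctx.saved_tensor()
            return dy * (1 - paddle.square(y))  # d tanh(x)/dx = 1 - tanh(x)^2


    x = paddle.randn([2, 3])
    x.stop_gradient = False
    out = cus_tanh.apply(x)
    out.mean().backward()
    print(x.grad.shape)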
@@ -129,16 +129,19 @@ PyObject* pylayer_method_apply(PyObject* cls,
   bool require_any_grad = false;
   size_t inputs_size = 0;
+  size_t args_size = 0;
+  size_t kwargs_size = 0;
   PyObject* forward_args = nullptr;
   PyObject* kwargs_value_list = nullptr;
   if (kwargs) {
-    inputs_size = PyDict_Size(kwargs);
+    kwargs_size = PyDict_Size(kwargs);
     kwargs_value_list = PyDict_Values(kwargs);
-    forward_args = PyTuple_New(1);
-  } else {
-    inputs_size = PyTuple_GET_SIZE(args);
-    forward_args = PyTuple_New(inputs_size + 1);
   }
+  if (args) {
+    args_size = PyTuple_GET_SIZE(args);
+  }
+  inputs_size = kwargs_size + args_size;
+  forward_args = PyTuple_New(args_size + 1);
   Py_INCREF(ctx);
   PyTuple_SET_ITEM(forward_args, 0, reinterpret_cast<PyObject*>(ctx));
@@ -150,8 +153,8 @@ PyObject* pylayer_method_apply(PyObject* cls,
   ctx->forward_input_tensor_is_duplicable.reserve(inputs_size);
   for (size_t i = 0; i < inputs_size; i++) {
     PyObject* obj = nullptr;
-    if (kwargs) {
-      obj = PyList_GetItem(kwargs_value_list, i);
+    if (i >= args_size) {
+      obj = PyList_GetItem(kwargs_value_list, i - args_size);
     } else {
       obj = PyTuple_GET_ITEM(args, i);
     }
@@ -212,7 +215,7 @@ PyObject* pylayer_method_apply(PyObject* cls,
       }
     }
-    if (!kwargs) {
+    if (i < args_size) {
       Py_INCREF(obj);
       PyTuple_SET_ITEM(forward_args, i + 1, obj);
     }
......
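The new C++ logic above merges positional and keyword inputs: the two sizes are computed separately, every input is visited in one loop (positional args first, then kwargs values), and only the positional inputs are packed into `forward_args` after the `ctx` slot. A rough Python sketch of that indexing, illustrative only and not Paddle source:

    # Mirrors the indexing performed by the new pylayer_method_apply code:
    # args come first, kwargs values follow, and only positional inputs are
    # forwarded in the tuple whose slot 0 is reserved for ctx.
    def gather_pylayer_inputs(ctx, args, kwargs):
        args_size = len(args)
        kwargs_values = list(kwargs.values()) if kwargs else []
        inputs_size = args_size + len(kwargs_values)

        forward_args = [ctx]      # corresponds to PyTuple_New(args_size + 1)
        all_inputs = []           # every input, inspected for grad tracking
        for i in range(inputs_size):
            obj = args[i] if i < args_size else kwargs_values[i - args_size]
            all_inputs.append(obj)
            if i < args_size:     # mirrors `if (i < args_size)` in the C++ hunk
                forward_args.append(obj)
        return tuple(forward_args), all_inputs


    print(gather_pylayer_inputs("ctx", (1, 2), {"a": 3}))
    # -> (('ctx', 1, 2), [1, 2, 3])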
@@ -17,7 +17,13 @@ from ..fluid.dygraph.base import no_grad_ as no_grad  # noqa: F401
 from ..framework import is_grad_enabled, set_grad_enabled  # noqa: F401
 from . import backward_mode  # noqa: F401
 from .backward_mode import backward  # noqa: F401
-from .py_layer import PyLayer, PyLayerContext, EagerPyLayer, EagerPyLayerContext  # noqa: F401
+from ..fluid.framework import _in_eager_mode_
+if _in_eager_mode_:
+    from .py_layer import EagerPyLayer as PyLayer  # noqa: F401
+    from .py_layer import EagerPyLayerContext as PyLayerContext  # noqa: F401
+else:
+    from .py_layer import LegacyPyLayer as PyLayer  # noqa: F401
+    from .py_layer import LegacyPyLayerContext as PyLayerContext  # noqa: F401
 from ..framework import set_grad_enabled, is_grad_enabled  # noqa: F401
 from ..fluid.dygraph.base import no_grad_ as no_grad  # noqa: F401
 from .functional import vjp, jvp, Jacobian, Hessian  # noqa: F401
......
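The `__init__.py` change above decides, once at import time, which implementation the public `PyLayer` / `PyLayerContext` names point to, based on `_in_eager_mode_`. A small self-contained sketch of the same aliasing pattern; the class names here are stand-ins, not Paddle source:

    _IN_EAGER_MODE = True  # stand-in for paddle.fluid.framework._in_eager_mode_


    class _EagerImpl:
        """Stand-in for the eager PyLayer implementation."""


    class _LegacyImpl:
        """Stand-in for the legacy (VarBase-based) PyLayer implementation."""


    # The public name stays `PyLayer`; which class backs it is fixed once, at import time.
    PyLayer = _EagerImpl if _IN_EAGER_MODE else _LegacyImpl

    print(PyLayer.__name__)  # _EagerImpl when the eager flag is on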
@@ -21,7 +21,7 @@ from paddle.fluid import core
 __all__ = []


-class PyLayerContext(object):
+class LegacyPyLayerContext(object):
     """
     The object of this class is a context that is used in PyLayer to enhance the function.
@@ -181,7 +181,7 @@ class CPyLayer(object):
         return core.pylayer_apply(place, cls, *args, **kwargs)


-class PyLayerBackward(PyLayerContext):
+class PyLayerBackward(LegacyPyLayerContext):

     def backward(self, *args, **kwargs):
         with paddle.fluid.dygraph.guard():
@@ -205,7 +205,7 @@ class LayerMeta(type):
         return super(LayerMeta, cls).__init__(name, bases, attrs)


-class PyLayer(with_mateclass(LayerMeta, CPyLayer)):
+class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)):
     """
     Build a custom `Layer` by creating subclasses. Subclasses need to follow the following rules:
     1. Subclasses contain `forward` and `backward` function. Both forward and backward are @staticmethod.
@@ -425,6 +425,8 @@ class EagerPyLayerContext(object):
         Examples:
             .. code-block:: python

+                import os
+                os.environ['FLAGS_enable_eager_mode'] = '1'
                 import paddle
                 from paddle.autograd import PyLayer
                 import numpy as np
@@ -464,6 +466,8 @@ class EagerPyLayerContext(object):
         Examples:
             .. code-block:: python

+                import os
+                os.environ['FLAGS_enable_eager_mode'] = '1'
                 import paddle
                 from paddle.autograd import PyLayer
                 import numpy as np
......
@@ -1181,9 +1181,9 @@ def _mp_allreduce(tensor,
     if in_dygraph_mode():
         assert op == ReduceOp.SUM, "Unknown parameter: {}.".format(op)
-        from paddle.autograd import EagerPyLayer
+        from paddle.autograd import PyLayer

-        class mp_allreduce_eager(EagerPyLayer):
+        class mp_allreduce_eager(PyLayer):

             @staticmethod
             def forward(ctx, tensor, use_calc_stream, ring_id,
......
@@ -37,7 +37,7 @@ from ..meta_optimizers import HybridParallelOptimizer, HeterParallelOptimizer
 from paddle import _C_ops
 from paddle.fluid import core
 from paddle.fluid.dygraph import to_variable
-from paddle.distributed.fleet.utils.recompute import RecomputeFunction
+from paddle.distributed.fleet.utils.recompute import LegacyRecomputeFunction
 from paddle.fluid.dygraph.varbase_patch_methods import _grad_scalar

 __all__ = []
@@ -68,7 +68,8 @@ class _RecomputeModelWrapper(paddle.nn.Layer):
         return do_run

     def _checkpoint(self, func, *args, **kwargs):
-        return RecomputeFunction.apply(func, self._preserve_rng_state, *args)
+        return LegacyRecomputeFunction.apply(func, self._preserve_rng_state,
+                                             *args)

     def forward(self, input):
         end = 0
......
@@ -17,7 +17,7 @@ import contextlib
 import paddle
 from paddle.fluid import core
 from paddle import _C_ops
-from paddle.autograd import PyLayer, EagerPyLayer
+from paddle.autograd import PyLayer
 from paddle.fluid import framework
 from ...utils.recompute import check_recompute_necessary, detach_variable, swith_rng_state_tracker
 from ..parallel_layers.random import get_rng_state_tracker
@@ -151,7 +151,7 @@ def _merge_activation(tensor):
     return _all_gather(tensor, group=mp_group)


-class _HPEagerRecomputeFunction(EagerPyLayer):
+class _HPRecomputeFunction(PyLayer):
     """
     Compared with paddle.distributed.fleet.utils.recompute, there are the following differences:
     1. In order to support PipeLineParallel, the input of recompute is modified to ensure that the input can be tuple type.
@@ -256,7 +256,7 @@ class _HPEagerRecomputeFunction(EagerPyLayer):
             detached_inputs = detach_variable(tuple(inputs))
             outputs = ctx.run_function(*detached_inputs)

-            if isinstance(outputs, core.eager.Tensor):
+            if isinstance(outputs, (core.VarBase, core.eager.Tensor)):
                 outputs = (outputs, )
             assert len(outputs) == len(args)
@@ -266,137 +266,8 @@ class _HPEagerRecomputeFunction(EagerPyLayer):
             for i in range(len(outputs)):
                 if isinstance(
                         outputs[i],
-                        core.eager.Tensor) and not outputs[i].stop_gradient:
+                    (core.VarBase,
+                     core.eager.Tensor)) and not outputs[i].stop_gradient:
                     forward_outputs_with_grad.append(outputs[i])
backward_inputs.append(args[i])
if len(forward_outputs_with_grad) == 0:
raise RuntimeError(
"none of output has stop_gradient=False, this recompute() is not necessary"
)
# actually backward
paddle.autograd.backward(forward_outputs_with_grad, backward_inputs)
grads = tuple(inp._grad_ivar() for inp in detached_inputs
if isinstance(inp, core.eager.Tensor))
return grads
class _HPRecomputeFunction(PyLayer):
"""
Compared with paddle.distributed.fleet.utils.recompute, there are the following differences:
1. In order to support PipeLineParallel, the input of recompute is modified to ensure that the input can be tuple type.
2. Offload support for activation
3. Support MP segmentation of activation to further reduce cuda memory
4. Adapt to the random state of MP
"""
@staticmethod
def forward(ctx, run_function, all_outputs, *args):
check_recompute_necessary(args)
# store for recomputing
ctx.run_function = run_function
# store the rng states
ctx.fwd_cuda_rng_state = paddle.get_cuda_rng_state()
ctx.fwd_cuda_rng_state_tracker = get_rng_state_tracker(
).get_states_tracker()
# save input for backward
ctx.inputs = []
ctx.tensor_indices = []
ctx.tensor_shapes = []
tensor_inputs = []
cur_device = paddle.get_device()
assert 'gpu:' in paddle.get_device(
), "Recompute with RNG is not support current device: {}.".format(
cur_device)
# TODO support AMP
tracer = framework._dygraph_tracer()
ctx.is_fw_autocast = False if tracer._amp_level == core.AmpLevel.O0 else True
if tracer._amp_level == core.AmpLevel.O2:
ctx.amp_level = 'O2'
elif tracer._amp_level in (core.AmpLevel.O1, core.AmpLevel.O0):
ctx.amp_level = 'O1'
else:
raise ValueError("unsupported amp level: {}".format(
tracer._amp_level))
ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list()
with paddle.no_grad():
outputs = run_function(*args)
for i, arg in enumerate(args):
if paddle.is_tensor(arg):
state = arg.stop_gradient
if _recompute_partition:
ctx.tensor_shapes.append(arg.shape)
partition = _split_activation(arg.detach()).clone()
# TODO(shenliang03) not use calculate stream to D2H to speed
arg = partition.cpu() if _recompute_offload else partition
else:
arg = arg.cpu() if _recompute_offload else arg
arg.stop_gradient = state
tensor_inputs.append(arg)
ctx.tensor_indices.append(i)
ctx.inputs.append(None)
else:
ctx.inputs.append(arg)
ctx.save_for_backward(*tensor_inputs)
if paddle.is_tensor(outputs):
all_outputs += [outputs]
return outputs
else:
all_outputs += outputs
return tuple(outputs)
@staticmethod
def backward(ctx, *args):
with paddle.fluid.dygraph.guard():
# Restore inputs
inputs = list(ctx.inputs)
tensor_indices = ctx.tensor_indices
tensor_shapes = ctx.tensor_shapes
tensors = list(ctx.saved_tensor())
device_id = paddle.distributed.ParallelEnv().device_id
for i, idx in enumerate(tensor_indices):
if _recompute_partition:
state = tensors[i].stop_gradient
tensors[i] = _merge_activation(
tensors[i]).detach().reshape_(tensor_shapes[i])
tensors[i].stop_gradient = state
inputs[idx] = tensors[i].cuda(
device_id) if _recompute_offload else tensors[i]
tracer = framework._dygraph_tracer()
tracer._has_grad = True
# need restore auto_cast state as well as w/b list
with swith_rng_state_tracker(ctx.fwd_cuda_rng_state,
ctx.fwd_cuda_rng_state_tracker):
with paddle.amp.auto_cast(enable=ctx.is_fw_autocast,
custom_white_list=ctx.amp_white_list,
custom_black_list=ctx.amp_black_list,
level=ctx.amp_level):
detached_inputs = detach_variable(tuple(inputs))
outputs = ctx.run_function(*detached_inputs)
if isinstance(outputs, core.VarBase):
outputs = (outputs, )
assert len(outputs) == len(args)
forward_outputs_with_grad = []
backward_inputs = []
for i in range(len(outputs)):
if isinstance(outputs[i],
core.VarBase) and not outputs[i].stop_gradient:
                    forward_outputs_with_grad.append(outputs[i])
                    backward_inputs.append(args[i])
@@ -408,7 +279,7 @@ class _HPRecomputeFunction(PyLayer):
            # actually backward
            paddle.autograd.backward(forward_outputs_with_grad, backward_inputs)
            grads = tuple(inp._grad_ivar() for inp in detached_inputs
-                          if isinstance(inp, core.VarBase))
+                          if isinstance(inp, (core.VarBase, core.eager.Tensor)))
            return grads
@@ -420,10 +291,7 @@ def _hp_recompute(function, *args):
    # 3. Here, we only use float dtype to distinguish whether a gradient is needed in output tensor
    all_outputs = []
-    if in_dygraph_mode():
-        _HPEagerRecomputeFunction.apply(function, all_outputs, *args)
-    else:
-        _HPRecomputeFunction.apply(function, all_outputs, *args)
+    _HPRecomputeFunction.apply(function, all_outputs, *args)

    if len(all_outputs) == 1:
        return all_outputs[0]
......
@@ -20,7 +20,7 @@ from collections import OrderedDict
 import paddle
 from paddle import nn
-from paddle.autograd import EagerPyLayer
+from paddle.autograd import PyLayer
 import paddle.fluid.core as core
 import paddle.fluid.framework as framework
 from paddle.fluid.framework import EagerParamBase
@@ -398,7 +398,7 @@ class GroupShardedStage3(nn.Layer):
     def _register_forward_hooks(self, layer):
         """
-        Register EagerPyLayer to manage memory slices.
+        Register PyLayer to manage memory slices.
         There are four stages:
         FW
         1. Before the forward layers, synchronize the full parameters.
@@ -653,7 +653,7 @@ def ForwardPreHooks(layer, order_tracer, trainable_params, param2buffer_size,
     return


-class ForwardPostHooks(EagerPyLayer):
+class ForwardPostHooks(PyLayer):

     @staticmethod
     def forward(ctx, inputs, layer, order_tracer, trainable_params,
......
@@ -14,7 +14,8 @@
 import paddle
 from paddle.fluid import core
-from paddle.autograd import PyLayer, EagerPyLayer
+from paddle.autograd import PyLayer
+from paddle.autograd.py_layer import LegacyPyLayer
 from paddle.fluid import framework
 import contextlib
@@ -68,7 +69,7 @@ def swith_rng_state_tracker(rng_state, tracker):
     get_rng_state_tracker().set_states_tracker(orig_cuda_rng_tracker)


-class EagerRecomputeFunction(EagerPyLayer):
+class LegacyRecomputeFunction(LegacyPyLayer):

     @staticmethod
     def forward(ctx, run_function, preserve_rng_state, *args):
@@ -171,7 +172,7 @@ class EagerRecomputeFunction(EagerPyLayer):
             detached_inputs = detach_variable(tuple(inputs))
             outputs = ctx.run_function(*detached_inputs)

-            if isinstance(outputs, core.eager.Tensor):
+            if isinstance(outputs, core.VarBase):
                 outputs = (outputs, )
             assert len(outputs) == len(args)
@@ -183,9 +184,8 @@ class EagerRecomputeFunction(EagerPyLayer):
             # the following backward_inputs_with_grad is used to avoid this case.
             backward_inputs_with_grad = []
             for i in range(len(outputs)):
-                if isinstance(
-                        outputs[i],
-                        core.eager.Tensor) and not outputs[i].stop_gradient:
+                if isinstance(outputs[i],
+                              core.VarBase) and not outputs[i].stop_gradient:
                     forward_outputs_with_grad.append(outputs[i])
                     backward_inputs_with_grad.append(args[i])
@@ -199,8 +199,8 @@ class EagerRecomputeFunction(EagerPyLayer):
             paddle.autograd.backward(forward_outputs_with_grad,
                                      backward_inputs_with_grad)
-            grads = tuple(inp.grad for inp in detached_inputs
-                          if isinstance(inp, core.eager.Tensor))
+            grads = list(inp._grad_ivar() for inp in detached_inputs
+                         if isinstance(inp, core.VarBase))
             return grads
@@ -307,7 +307,7 @@ class RecomputeFunction(PyLayer):
             detached_inputs = detach_variable(tuple(inputs))
             outputs = ctx.run_function(*detached_inputs)

-            if isinstance(outputs, core.VarBase):
+            if isinstance(outputs, (core.VarBase, core.eager.Tensor)):
                 outputs = (outputs, )
             assert len(outputs) == len(args)
@@ -319,8 +319,10 @@ class RecomputeFunction(PyLayer):
             # the following backward_inputs_with_grad is used to avoid this case.
             backward_inputs_with_grad = []
             for i in range(len(outputs)):
-                if isinstance(outputs[i],
-                              core.VarBase) and not outputs[i].stop_gradient:
+                if isinstance(
+                        outputs[i],
+                    (core.VarBase,
+                     core.eager.Tensor)) and not outputs[i].stop_gradient:
                     forward_outputs_with_grad.append(outputs[i])
                     backward_inputs_with_grad.append(args[i])
@@ -334,8 +336,14 @@ class RecomputeFunction(PyLayer):
             paddle.autograd.backward(forward_outputs_with_grad,
                                      backward_inputs_with_grad)
-            grads = list(inp._grad_ivar() for inp in detached_inputs
-                         if isinstance(inp, core.VarBase))
+            if in_dygraph_mode():
+                grads = tuple(
+                    inp._grad_ivar() for inp in detached_inputs
+                    if isinstance(inp, (core.VarBase, core.eager.Tensor)))
+            else:
+                grads = list(
+                    inp._grad_ivar() for inp in detached_inputs
+                    if isinstance(inp, (core.VarBase, core.eager.Tensor)))
             return grads
@@ -465,7 +473,4 @@ def recompute(function, *args, **kwargs):
     if framework._dygraph_tracer()._has_grad:
         check_recompute_necessary(args)

-    if in_dygraph_mode():
-        return EagerRecomputeFunction.apply(function, preserve, *args)
-    else:
-        return RecomputeFunction.apply(function, preserve, *args)
+    return RecomputeFunction.apply(function, preserve, *args)
@@ -60,7 +60,9 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_sharding_parallel)
 list(APPEND DIST_TEST_OPS test_dygraph_sharding_optimizer_stage2)
 list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage2)
 list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage3)
+list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage3_for_eager)
 list(APPEND DIST_TEST_OPS test_dygraph_group_sharded_api)
+list(APPEND DIST_TEST_OPS test_dygraph_group_sharded_api_for_eager)
 list(APPEND DIST_TEST_OPS test_auto_parallel_parallelizer)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers)
 list(APPEND DIST_TEST_OPS test_hybrid_parallel_inference_helper)
@@ -305,13 +307,17 @@ if((NOT WITH_GPU) AND (NOT WITH_ROCM))
  list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_optimizer_stage2)
  list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage2)
  list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage3)
+  list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage3_for_eager)
  list(REMOVE_ITEM TEST_OPS test_dygraph_group_sharded_api)
+  list(REMOVE_ITEM TEST_OPS test_dygraph_group_sharded_api_for_eager)
  list(REMOVE_ITEM TEST_OPS test_auto_parallel_parallelizer)
  list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers)
  list(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision)
+  list(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision_for_eager)
  list(REMOVE_ITEM TEST_OPS test_mixed_precision)
  list(REMOVE_ITEM TEST_OPS test_fleet_base_single)
  list(REMOVE_ITEM TEST_OPS test_dygraph_recompute)
+  list(REMOVE_ITEM TEST_OPS test_dygraph_recompute_for_eager)
  list(REMOVE_ITEM TEST_OPS test_hybrid_parallel_inference_helper)
  list(REMOVE_ITEM TEST_OPS test_parallel_class_center_sample)
  list(REMOVE_ITEM TEST_OPS test_parallel_margin_cross_entropy)
@@ -1547,7 +1553,11 @@ if(WITH_DISTRIBUTE
                       120)
  set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 200)
  set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 350)
+  set_tests_properties(test_dygraph_sharding_stage3_for_eager PROPERTIES TIMEOUT
+                       350)
  set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_dygraph_group_sharded_api_for_eager
+                       PROPERTIES TIMEOUT 120)
  set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120)
  set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120)
  set_tests_properties(test_hybrid_parallel_inference_helper PROPERTIES TIMEOUT
@@ -1637,6 +1647,8 @@ endif()
if(WITH_GPU OR WITH_ROCM)
  set_tests_properties(test_imperative_auto_mixed_precision PROPERTIES TIMEOUT
                       300)
+  set_tests_properties(test_imperative_auto_mixed_precision_for_eager
+                       PROPERTIES TIMEOUT 300)
  set_tests_properties(test_parallel_dygraph_sync_batch_norm PROPERTIES TIMEOUT
                       120)
  set_tests_properties(test_rank_attention_op PROPERTIES TIMEOUT 120)
......
@@ -21,7 +21,7 @@ import paddle
 import numpy as np
 import paddle.distributed as dist
 from paddle.fluid.dygraph.nn import Linear
-from paddle.autograd import PyLayer, EagerPyLayer
+from paddle.autograd import PyLayer
 from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph
 from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients
@@ -45,21 +45,6 @@ class cus_tanh(PyLayer):
         return grad


-class cus_tanh_eager(EagerPyLayer):
-
-    @staticmethod
-    def forward(ctx, x):
-        y = paddle.tanh(x)
-        ctx.save_for_backward(y)
-        return y
-
-    @staticmethod
-    def backward(ctx, dy):
-        y, = ctx.saved_tensor()
-        grad = dy * (1 - paddle.square(y))
-        return grad
-
 class SimpleNet(paddle.nn.Layer):

     def __init__(self, train_id, model_id):
@@ -73,10 +58,7 @@ class SimpleNet(paddle.nn.Layer):
     def forward(self, inputs):
         if self.model_id == 0:
-            if in_dygraph_mode():
-                inputs = cus_tanh_eager.apply(inputs)
-            elif _in_legacy_dygraph():
-                inputs = cus_tanh.apply(inputs)
+            inputs = cus_tanh.apply(inputs)
         else:
             inputs = self.tanh(inputs)
......
@@ -15,6 +15,9 @@
 from __future__ import print_function

 import os
+os.environ['FLAGS_enable_eager_mode'] = '0'
 import unittest
 import paddle.fluid as fluid
@@ -26,9 +29,7 @@ class TestDygraphGroupSharded(TestMultipleGpus):
     # check group sharded logic as well as the accuracy with single mode
     def test_dygraph_group_sharded(self):
         self.run_mnist_2gpu('dygraph_group_sharded_api.py', eager_mode=False)
-        self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py')

 if __name__ == "__main__":
-    os.environ["FLAGS_enable_eager_mode"] = "1"
     unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
os.environ['FLAGS_enable_eager_mode'] = '1'
import unittest
import paddle.fluid as fluid
from test_parallel_dygraph_dataparallel import TestMultipleGpus
class TestDygraphGroupSharded(TestMultipleGpus):
# check group sharded logic as well as the accuracy with single mode
def test_dygraph_group_sharded(self):
self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py')
if __name__ == "__main__":
unittest.main()
@@ -23,7 +23,6 @@ from paddle.distributed.fleet.utils import recompute
 import random
 import paddle.fluid.layers as layers
-from paddle.fluid.framework import _test_eager_guard


 def get_fc_block(block_idx, input_size, is_last=False):
@@ -181,34 +180,15 @@ class TestPyLayer(unittest.TestCase):
         check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)

     def test_fc_net_with_dropout(self):
-        with _test_eager_guard():
-            self.test_base_case()
         self.test_base_case()

-    def test_fc_net_without_restore_rng(self):
-        with _test_eager_guard():
-            loss_ref, param_ref, grad_ref = run_model(
-                recompute_block=[2],
-                recompute_kwargs={"preserve_rng_state": False},
-                enable_autocast=True)
-
     def test_fc_net_with_amp(self):
-        with _test_eager_guard():
-            self.test_base_case(enable_autocast=True)
         self.test_base_case(enable_autocast=True)

     def test_fc_net_with_fp16(self):
-        with _test_eager_guard():
-            self.test_base_case(enable_autocast=True, pure_fp16=True)
         self.test_base_case(enable_autocast=True, pure_fp16=True)

     def test_recompute_kwargs(self):
-        with _test_eager_guard():
-            paddle.set_device("gpu")
-            kwargs = {"is_test": False}
-            with self.assertRaises(ValueError):
-                loss_ref, param_ref, grad_ref = run_model(
-                    recompute_block=[2], recompute_kwargs=kwargs)
         paddle.set_device("gpu")
         kwargs = {"is_test": False}
         with self.assertRaises(ValueError):
@@ -216,11 +196,6 @@ class TestPyLayer(unittest.TestCase):
                 recompute_kwargs=kwargs)

     def test_recompute_cpu_rng(self):
-        with _test_eager_guard():
-            paddle.set_device("cpu")
-            with self.assertRaises(RuntimeError):
-                loss_ref, param_ref, grad_ref = run_model(recompute_block=[2])
-
         paddle.set_device("cpu")
         with self.assertRaises(RuntimeError):
             loss_ref, param_ref, grad_ref = run_model(recompute_block=[2])
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
os.environ['FLAGS_enable_eager_mode'] = '1'
import unittest
import numpy as np
import paddle
from paddle.autograd import PyLayer
from paddle.distributed.fleet.utils import recompute
import random
import paddle.fluid.layers as layers
def get_fc_block(block_idx, input_size, is_last=False):
block_name = "block_" + str(block_idx)
block = paddle.nn.Sequential(
(block_name + "_fc_0",
paddle.nn.Linear(input_size, input_size, bias_attr=False)),
(block_name + "_dropout", paddle.nn.Dropout(p=0.5)),
(block_name + "_relu_1", paddle.nn.ReLU()),
(block_name + "_fc_1",
paddle.nn.Linear(input_size, input_size, bias_attr=False)),
(block_name + "_relu_2", paddle.nn.ReLU()),
)
if is_last:
block.add_sublayer(block_name + "_fc_2",
paddle.nn.Linear(input_size, 1,
bias_attr=False)) # add sublayer
else:
block.add_sublayer(block_name + "_fc_2",
paddle.nn.Linear(input_size,
input_size,
bias_attr=False)) # add sublayer
return block
class Naive_fc_net(paddle.nn.Layer):
def __init__(self,
input_size=10,
recompute_blocks=[1, 3],
recompute_kwargs={}):
super(Naive_fc_net, self).__init__()
self.recompute_blocks = recompute_blocks
self.recompute_kwargs = recompute_kwargs
self.runfunc0 = get_fc_block(0, input_size, is_last=False)
self.runfunc1 = get_fc_block(1, input_size, is_last=False)
self.runfunc2 = get_fc_block(2, input_size, is_last=False)
self.runfunc3 = get_fc_block(3, input_size, is_last=False)
self.runfunc4 = get_fc_block(4, input_size, is_last=True)
def forward(self, inputs):
if 0 in self.recompute_blocks:
inputs = recompute(self.runfunc0, inputs)
else:
inputs = self.runfunc0(inputs)
if 1 in self.recompute_blocks:
inputs = recompute(self.runfunc1, inputs)
else:
inputs = self.runfunc1(inputs)
if 2 in self.recompute_blocks:
inputs = recompute(self.runfunc2, inputs, **self.recompute_kwargs)
else:
inputs = self.runfunc2(inputs)
if 3 in self.recompute_blocks:
inputs = recompute(self.runfunc3, inputs)
else:
inputs = self.runfunc3(inputs)
if 4 in self.recompute_blocks:
inputs = recompute(self.runfunc4, inputs)
else:
inputs = self.runfunc4(inputs)
return inputs
def run_model(recompute_block=[],
recompute_kwargs={},
enable_autocast=False,
pure_fp16=False):
gen = paddle.seed(10)
gen.manual_seed(10)
np.random.seed(10)
random.seed(10)
batch_size, input_size = 1, 10
model = Naive_fc_net(input_size,
recompute_blocks=recompute_block,
recompute_kwargs=recompute_kwargs)
loss_fn = paddle.nn.MSELoss(reduction='mean')
optimizer = paddle.optimizer.SGD(learning_rate=0.01,
parameters=model.parameters())
if enable_autocast:
scaler = paddle.amp.GradScaler()
loss_ = []
param_ = []
grad_ = []
for step in range(10):
x_data = np.random.randn(batch_size, input_size).astype(np.float32)
x = paddle.to_tensor(x_data)
# x.stop_gradient = False
level = 'O2' if pure_fp16 else 'O1'
with paddle.amp.auto_cast(True, level=level):
y_pred = model(x)
loss = y_pred.mean()
if enable_autocast:
scaler.scale(loss).backward()
scaler.minimize(optimizer, loss)
else:
loss_.append(np.asarray(loss).tolist())
loss.backward()
optimizer.step()
param_.append(np.asarray(model.parameters()[9]).tolist())
grad_.append(np.asarray(model.parameters()[3]._grad_ivar()).tolist())
optimizer.clear_grad()
return loss_, param_, grad_
class TestPyLayer(unittest.TestCase):
def test_base_case(self, enable_autocast=False, pure_fp16=False):
def check_identical(loss_ref, param_ref, grad_ref, loss, param, grad):
self.assertEqual(loss_ref, loss)
self.assertEqual(param_ref, param)
self.assertEqual(grad_ref, grad)
# without recompute
loss_ref, param_ref, grad_ref = run_model(
recompute_block=[],
enable_autocast=enable_autocast,
pure_fp16=pure_fp16)
# recompute second block
loss, param, grad = run_model(recompute_block=[1],
enable_autocast=enable_autocast,
pure_fp16=pure_fp16)
check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)
# recompute fourth block
loss, param, grad = run_model(recompute_block=[3],
enable_autocast=enable_autocast,
pure_fp16=pure_fp16)
check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)
# recompute second to fourth block
loss, param, grad = run_model(recompute_block=[1, 2, 3],
enable_autocast=enable_autocast,
pure_fp16=pure_fp16)
check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)
# recompute second & fourth block
loss, param, grad = run_model(recompute_block=[1, 3],
enable_autocast=enable_autocast,
pure_fp16=pure_fp16)
check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)
def test_fc_net_with_dropout(self):
self.test_base_case()
def test_fc_net_without_restore_rng(self):
loss_ref, param_ref, grad_ref = run_model(
recompute_block=[2],
recompute_kwargs={"preserve_rng_state": False},
enable_autocast=True)
def test_fc_net_with_amp(self):
self.test_base_case(enable_autocast=True)
def test_fc_net_with_fp16(self):
self.test_base_case(enable_autocast=True, pure_fp16=True)
def test_recompute_kwargs(self):
paddle.set_device("gpu")
kwargs = {"is_test": False}
with self.assertRaises(ValueError):
loss_ref, param_ref, grad_ref = run_model(recompute_block=[2],
recompute_kwargs=kwargs)
def test_recompute_cpu_rng(self):
paddle.set_device("cpu")
with self.assertRaises(RuntimeError):
loss_ref, param_ref, grad_ref = run_model(recompute_block=[2])
if __name__ == '__main__':
unittest.main()
@@ -15,6 +15,9 @@
 from __future__ import print_function

 import os
+os.environ['FLAGS_enable_eager_mode'] = '0'
 import unittest
 import paddle.fluid as fluid
@@ -25,15 +28,12 @@ class TestDygraphShardingStage3(TestMultipleGpus):
     # check sharding logic as well as the accuracy with single mode
     def test_dygraph_sharding_stage3(self):
-        self.run_mnist_2gpu('dygraph_group_sharded_stage3.py')
         self.run_mnist_2gpu('dygraph_sharding_stage3.py', eager_mode=False)

     def test_dygraph_sharding_stage3_offload(self):
-        self.run_mnist_2gpu('dygraph_group_sharded_stage3_offload.py')
         self.run_mnist_2gpu('dygraph_sharding_stage3_offload.py',
                             eager_mode=False)

 if __name__ == "__main__":
-    os.environ["FLAGS_enable_eager_mode"] = "1"
     unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
os.environ['FLAGS_enable_eager_mode'] = '1'
import os
import unittest
import paddle.fluid as fluid
from test_parallel_dygraph_dataparallel import TestMultipleGpus
class TestDygraphShardingStage3(TestMultipleGpus):
# check sharding logic as well as the accuracy with single mode
def test_dygraph_sharding_stage3(self):
self.run_mnist_2gpu('dygraph_group_sharded_stage3.py')
def test_dygraph_sharding_stage3_offload(self):
self.run_mnist_2gpu('dygraph_group_sharded_stage3_offload.py')
if __name__ == "__main__":
os.environ["FLAGS_enable_eager_mode"] = "1"
unittest.main()
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import os
+os.environ['FLAGS_enable_eager_mode'] = '0'
 import unittest
 import paddle
 import paddle.fluid as fluid
@@ -19,13 +23,11 @@ import paddle.fluid.core as core
 import numpy as np
 import six
 import cv2
-import os
 import tempfile
 from test_imperative_resnet import ResNet, BottleneckBlock, ConvBNLayer, train_parameters, optimizer_setting
 import paddle.nn as nn
 from paddle.static import InputSpec
 from paddle.autograd import PyLayer
-from paddle.fluid.framework import _test_eager_guard

 if fluid.core.is_compiled_with_cuda():
     fluid.set_flags({"FLAGS_cudnn_deterministic": True})
@@ -73,8 +75,6 @@ class TestAutoCast(unittest.TestCase):
         self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32)

     def test_amp_guard_white_op(self):
-        with _test_eager_guard():
-            self.amp_guard_white_op()
         self.amp_guard_white_op()

     def amp_guard_black_op(self):
@@ -88,8 +88,6 @@ class TestAutoCast(unittest.TestCase):
         self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32)

     def test_amp_guard_black_op(self):
-        with _test_eager_guard():
-            self.amp_guard_black_op()
         self.amp_guard_black_op()

     def custom_op_list(self):
@@ -123,8 +121,6 @@ class TestAutoCast(unittest.TestCase):
                 | {"conv2d"})

     def test_custom_op_list(self):
-        with _test_eager_guard():
-            self.custom_op_list()
         self.custom_op_list()

     def custom_op_list_exception(self):
@@ -145,8 +141,6 @@ class TestAutoCast(unittest.TestCase):
         self.assertRaises(ValueError, func)

     def test_custom_op_list_exception(self):
-        with _test_eager_guard():
-            self.custom_op_list_exception()
         self.custom_op_list_exception()

     def amp_guard_upsupported_fp16_op(self):
@@ -174,8 +168,6 @@ class TestAutoCast(unittest.TestCase):
             out_purefp16_fp32.dtype == fluid.core.VarDesc.VarType.FP32)

     def test_amp_guard_upsupported_fp16_op(self):
-        with _test_eager_guard():
-            self.amp_guard_upsupported_fp16_op()
         self.amp_guard_upsupported_fp16_op()

     def mode_exception(self):
@@ -195,8 +187,6 @@ class TestAutoCast(unittest.TestCase):
         self.assertRaises(ValueError, func)

     def test_mode_exception(self):
-        with _test_eager_guard():
-            self.mode_exception()
         self.mode_exception()
@@ -212,8 +202,6 @@ class TestAmpScaler(unittest.TestCase):
                                data.numpy() * 1024), True)

     def test_scale(self):
-        with _test_eager_guard():
-            self.scale()
         self.scale()

     def minimize(self):
@@ -265,8 +253,6 @@ class TestAmpScaler(unittest.TestCase):
                             outs_no_scaler[1][i][0].numpy()), True)

     def test_minimize(self):
-        with _test_eager_guard():
-            self.minimize()
         self.minimize()

     def step(self):
@@ -310,8 +296,6 @@ class TestAmpScaler(unittest.TestCase):
                             outs_no_scaler[i].numpy()), True)

     def test_step(self):
-        with _test_eager_guard():
-            self.step()
         self.step()

     def nan_inf(self):
@@ -344,8 +328,6 @@ class TestAmpScaler(unittest.TestCase):
                 np.array_equal(param.numpy(), params_init[param.name]))

     def test_nan_inf(self):
-        with _test_eager_guard():
-            self.nan_inf()
         self.nan_inf()

     def step_update_exception(self):
@@ -396,8 +378,6 @@ class TestAmpScaler(unittest.TestCase):
         self.assertRaises(RuntimeError, func3)

     def test_step_update_exception(self):
-        with _test_eager_guard():
-            self.step_update_exception()
         self.step_update_exception()

     def test_get_and_set(self):
@@ -578,8 +558,6 @@ class TestGradScalerStateDict(unittest.TestCase):
             self.assertTrue(
                 np.allclose(out_use_state_dict[0], out_no_state_dict[0]))

-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
@@ -742,8 +720,6 @@ class TestStateDictHookForAMP(unittest.TestCase):
             for key in param_value_ori.keys():
                 print(np.equal(param_value_ori[key], param_value_now[key]))

-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
@@ -899,8 +875,6 @@ class TestPureFp16SaveLoad(unittest.TestCase):
             self.assertTrue(
                 np.allclose(out_use_save_load[0], out_no_save_load[0]))

-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
@@ -1005,8 +979,6 @@ class TestPureFp16InferenceSaveLoad(unittest.TestCase):
     def test_inference_save_load(self):
         self.inference_save_load()
-        with _test_eager_guard():
-            self.inference_save_load()


 class TestResnet2(unittest.TestCase):
@@ -1146,8 +1118,6 @@ class TestResnet2(unittest.TestCase):
             self.assertTrue(
                 np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2))

-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()

     def test_with_data_loader(self):
@@ -1166,8 +1136,6 @@ class TestResnet2(unittest.TestCase):
             self.assertTrue(
                 np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2))

-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()

     def test_param_group(self):
@@ -1189,8 +1157,6 @@ class TestResnet2(unittest.TestCase):
             self.assertTrue(
                 np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2))

-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
@@ -1285,8 +1251,6 @@ class TestResnet(unittest.TestCase):
             self.assertTrue(
                 np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-1))

-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
@@ -1308,8 +1272,6 @@ class TestLayerNormFp16(unittest.TestCase):
             self.assertTrue(
                 out.dtype == fluid.core.VarDesc.VarType.FP16)

-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
@@ -1344,8 +1306,6 @@ class TestBf16(unittest.TestCase):
             self.assertTrue(
                 np.allclose(out_fp32, out_bf16_O2, rtol=1.e-3, atol=1.e-1))

-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
@@ -1399,8 +1359,6 @@ class TestAmpWithHook(unittest.TestCase):
             loss = a.sum()
             self.assertRaises(RuntimeError, loss.backward)

-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()

     def test_hook_change_place(self):
@@ -1420,8 +1378,6 @@ class TestAmpWithHook(unittest.TestCase):
             loss = a.sum()
             self.assertRaises(RuntimeError, loss.backward)

-        with _test_eager_guard():
-            func_isinstance()
         func_isinstance()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
os.environ['FLAGS_enable_eager_mode'] = '1'
import unittest
import paddle
import paddle.fluid as fluid
import numpy as np
import six
import cv2
import tempfile
from test_imperative_resnet import ResNet, BottleneckBlock, ConvBNLayer, train_parameters, optimizer_setting
import paddle.nn as nn
from paddle.static import InputSpec
from paddle.autograd import PyLayer
if fluid.core.is_compiled_with_cuda():
fluid.set_flags({"FLAGS_cudnn_deterministic": True})
class SimpleConv(fluid.dygraph.Layer):
def __init__(self,
num_channels,
num_filters,
filter_size,
stride=1,
groups=1,
act=None):
super(SimpleConv, self).__init__()
self._conv = fluid.dygraph.Conv2D(num_channels=num_channels,
num_filters=num_filters,
filter_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
act=None,
bias_attr=None,
use_cudnn=True)
def forward(self, inputs):
return self._conv(inputs)
class TestAutoCast(unittest.TestCase):
def amp_guard_white_op(self):
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
conv2d = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
data = fluid.dygraph.to_variable(data)
with fluid.dygraph.amp_guard(True):
out_fp16 = conv2d(data)
with fluid.dygraph.amp_guard(False):
out_fp32 = conv2d(data)
self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32)
self.assertTrue(out_fp16.dtype == fluid.core.VarDesc.VarType.FP16)
self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32)
def test_amp_guard_white_op(self):
self.amp_guard_white_op()
def amp_guard_black_op(self):
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
data = fluid.dygraph.to_variable(data)
with fluid.dygraph.amp_guard(True):
out_fp32 = fluid.layers.mean(data)
self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32)
self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32)
def test_amp_guard_black_op(self):
self.amp_guard_black_op()
def custom_op_list(self):
with fluid.dygraph.guard():
tracer = fluid.framework._dygraph_tracer()
base_white_list = fluid.dygraph.amp.auto_cast.WHITE_LIST
base_black_list = fluid.dygraph.amp.auto_cast.BLACK_LIST
with fluid.dygraph.amp_guard(custom_white_list=["log"],
custom_black_list=["conv2d"]):
white_list, black_list = tracer._get_amp_op_list()
self.assertTrue(
set(white_list) == (set(base_white_list) | {"log"}) -
{"conv2d"})
self.assertTrue(
set(black_list) == (set(base_black_list) - {"log"})
| {"conv2d"})
base_white_list = fluid.dygraph.amp.auto_cast.PURE_FP16_WHITE_LIST
base_black_list = fluid.dygraph.amp.auto_cast.PURE_FP16_BLACK_LIST
with fluid.dygraph.amp_guard(custom_white_list=["log"],
custom_black_list=["conv2d"],
level='O2'):
white_list, black_list = tracer._get_amp_op_list()
self.assertTrue(
set(white_list) == (set(base_white_list) | {"log"}) -
{"conv2d"})
self.assertTrue(
set(black_list) == (set(base_black_list) - {"log"})
| {"conv2d"})
def test_custom_op_list(self):
self.custom_op_list()
def custom_op_list_exception(self):
inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
def func():
with fluid.dygraph.guard():
model = SimpleConv(num_channels=3,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
with fluid.dygraph.amp_guard(custom_white_list=["conv2d"],
custom_black_list=["conv2d"]):
inp = fluid.dygraph.to_variable(inp_np)
out = model(inp)
self.assertRaises(ValueError, func)
def test_custom_op_list_exception(self):
self.custom_op_list_exception()
def amp_guard_upsupported_fp16_op(self):
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
conv2d = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
data = fluid.dygraph.to_variable(data)
with fluid.dygraph.amp_guard(True):
out_amp_fp16 = conv2d(data)
out_amp_fp32 = paddle.expand_as(
out_amp_fp16,
out_amp_fp16) # expand_as_v2 has no fp16 kernel
with fluid.dygraph.amp_guard(True, level='O2'):
out_purefp16_fp16 = conv2d(data)
out_purefp16_fp32 = paddle.expand_as(
out_purefp16_fp16,
out_purefp16_fp16) # expand_as_v2 has no fp16 kernel
self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32)
self.assertTrue(out_amp_fp16.dtype == fluid.core.VarDesc.VarType.FP16)
self.assertTrue(out_amp_fp32.dtype == fluid.core.VarDesc.VarType.FP32)
self.assertTrue(
out_purefp16_fp16.dtype == fluid.core.VarDesc.VarType.FP16)
self.assertTrue(
out_purefp16_fp32.dtype == fluid.core.VarDesc.VarType.FP32)
def test_amp_guard_upsupported_fp16_op(self):
self.amp_guard_upsupported_fp16_op()
def mode_exception(self):
def func():
data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
with fluid.dygraph.guard():
conv2d = fluid.dygraph.Conv2D(3,
2,
3,
bias_attr=False,
act=None)
data = fluid.dygraph.to_variable(data)
with fluid.dygraph.amp_guard(level='O'):
out = conv2d(data)
self.assertRaises(ValueError, func)
def test_mode_exception(self):
self.mode_exception()
class TestAmpScaler(unittest.TestCase):
def scale(self):
with fluid.dygraph.guard():
data = paddle.rand([10, 1024])
scaler = paddle.fluid.dygraph.AmpScaler(init_loss_scaling=1024)
scaled_data = scaler.scale(data)
self.assertEqual(
np.array_equal(scaled_data.numpy(),
data.numpy() * 1024), True)
def test_scale(self):
self.scale()
def minimize(self):
inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
def run_simple_conv(inp_np, use_scaler=True):
paddle.seed(10)
paddle.framework.random._manual_program_seed(10)
with fluid.dygraph.guard():
model = SimpleConv(num_channels=3,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
optimizer = fluid.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=model.parameters())
scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
data = fluid.dygraph.to_variable(inp_np)
out = model(data)
loss = fluid.layers.mean(out)
if use_scaler:
print('use scaler')
scaled_loss = scaler.scale(loss)
scaled_loss.backward()
optimize_ops, params_grads = scaler.minimize(
optimizer, scaled_loss)
else:
print('use no scaler')
loss.backward()
optimize_ops, params_grads = optimizer.minimize(loss)
return optimize_ops, params_grads
outs_with_scaler = run_simple_conv(inp_np, use_scaler=True)
outs_no_scaler = run_simple_conv(inp_np, use_scaler=False)
self.assertEqual(outs_with_scaler[0],
[]) # optimize_ops is [] in dygraph mode
self.assertEqual(outs_no_scaler[0],
[]) # optimize_ops is [] in dygraph mode
for i in range(len(outs_with_scaler[1])):
# check each grad
self.assertEqual(
np.allclose(outs_with_scaler[1][i][1].numpy(),
outs_no_scaler[1][i][1].numpy()), True)
# check each parameter
self.assertEqual(
np.allclose(outs_with_scaler[1][i][0].numpy(),
outs_no_scaler[1][i][0].numpy()), True)
def test_minimize(self):
self.minimize()
def step(self):
inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
def run_simple_conv(inp_np, use_scaler=True):
paddle.seed(10)
paddle.framework.random._manual_program_seed(10)
with fluid.dygraph.guard():
model = SimpleConv(num_channels=3,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
optimizer = paddle.optimizer.SGD(learning_rate=0.01,
parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
data = fluid.dygraph.to_variable(inp_np)
out = model(data)
loss = fluid.layers.mean(out)
if use_scaler:
print('use scaler')
scaled_loss = scaler.scale(loss)
scaled_loss.backward()
scaler.step(optimizer)
scaler.update()
else:
print('use no scaler')
loss.backward()
optimizer.step()
return optimizer._parameter_list
outs_with_scaler = run_simple_conv(inp_np, use_scaler=True)
outs_no_scaler = run_simple_conv(inp_np, use_scaler=False)
for i in range(len(outs_with_scaler)):
# check each parameter
self.assertEqual(
np.allclose(outs_with_scaler[i].numpy(),
outs_no_scaler[i].numpy()), True)
def test_step(self):
self.step()
def nan_inf(self):
inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32)
inp_np[0][1][2][3] = np.nan
with fluid.dygraph.guard():
model = SimpleConv(num_channels=3,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
params_init = {}
for param in model.parameters():
params_init[param.name] = param.numpy()
optimizer = fluid.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=model.parameters())
scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024)
data = fluid.dygraph.to_variable(inp_np)
out = model(data)
loss = fluid.layers.mean(out)
scaled_loss = scaler.scale(loss)
scaled_loss.backward()
optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss)
self.assertEqual(scaler._found_inf.numpy() == 1, True)
for param in model.parameters():
# params are not updated when gradients contain nan or inf
self.assertTrue(
np.array_equal(param.numpy(), params_init[param.name]))
def test_nan_inf(self):
self.nan_inf()
def step_update_exception(self):
def func1():
model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
optimizer = paddle.optimizer.SGD(learning_rate=0.01,
parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
data = paddle.rand([10, 3, 32, 32])
conv = model(data)
loss = paddle.mean(conv)
scaled = scaler.scale(loss)
scaled.backward()
scaler.unscale_(optimizer)
scaler.unscale_(optimizer)
self.assertRaises(RuntimeError, func1)
def func2():
model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
optimizer = paddle.optimizer.SGD(learning_rate=0.01,
parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
data = paddle.rand([10, 3, 32, 32])
conv = model(data)
loss = paddle.mean(conv)
scaled = scaler.scale(loss)
scaled.backward()
scaler.step(optimizer)
scaler.unscale_(optimizer)
self.assertRaises(RuntimeError, func2)
def func3():
model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True)
optimizer = paddle.optimizer.SGD(learning_rate=0.01,
parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
data = paddle.rand([10, 3, 32, 32])
conv = model(data)
loss = paddle.mean(conv)
scaled = scaler.scale(loss)
scaled.backward()
scaler.step(optimizer)
scaler.step(optimizer)
self.assertRaises(RuntimeError, func3)
def test_step_update_exception(self):
self.step_update_exception()
def test_get_and_set(self):
with fluid.dygraph.guard():
scaler = paddle.amp.GradScaler(enable=True,
init_loss_scaling=1024,
incr_ratio=2.0,
decr_ratio=0.5,
incr_every_n_steps=1000,
decr_every_n_nan_or_inf=2,
use_dynamic_loss_scaling=True)
self.assertEqual(scaler.is_enable() == True, True)
self.assertEqual(scaler.get_init_loss_scaling() == 1024, True)
self.assertEqual(scaler.get_incr_ratio() == 2.0, True)
self.assertEqual(scaler.get_decr_ratio() == 0.5, True)
self.assertEqual(scaler.get_incr_every_n_steps() == 1000, True)
self.assertEqual(scaler.get_decr_every_n_nan_or_inf() == 2, True)
self.assertEqual(scaler.is_use_dynamic_loss_scaling() == True, True)
scaler.set_decr_every_n_nan_or_inf(4)
self.assertEqual(scaler.get_decr_every_n_nan_or_inf() == 4, True)
scaler.set_decr_ratio(0.1)
self.assertEqual(scaler.get_decr_ratio() == 0.1, True)
scaler.set_incr_every_n_steps(200)
self.assertEqual(scaler.get_incr_every_n_steps() == 200, True)
scaler.set_incr_ratio(3.0)
self.assertEqual(scaler.get_incr_ratio() == 3.0, True)
scaler.set_init_loss_scaling(100)
self.assertEqual(scaler.get_init_loss_scaling() == 100, True)
def test_state_dict_and_load_state_dict(self):
with fluid.dygraph.guard():
scaler1 = paddle.amp.GradScaler(enable=True,
init_loss_scaling=14,
incr_ratio=233.0,
decr_ratio=0.523,
incr_every_n_steps=1090,
decr_every_n_nan_or_inf=20,
use_dynamic_loss_scaling=True)
scaler_state = scaler1.state_dict()
scaler2 = paddle.amp.GradScaler(enable=True)
scaler2.load_state_dict(scaler_state)
self.assertEqual(scaler2.get_init_loss_scaling() == 14, True)
self.assertEqual(scaler2.get_incr_ratio() == 233.0, True)
self.assertEqual(scaler2.get_decr_ratio() == 0.523, True)
self.assertEqual(scaler2.get_incr_every_n_steps() == 1090, True)
self.assertEqual(scaler2.get_decr_every_n_nan_or_inf() == 20, True)
scaler3 = paddle.amp.GradScaler(enable=False)
scaler3.load_state_dict(scaler_state)
self.assertEqual(scaler3.is_enable() == False, True)
def test_state_dict_and_load_state_dict_error(self):
def test_error():
state_empty = {}
scaler = paddle.amp.GradScaler(enable=True)
scaler.load_state_dict(state_empty)
self.assertRaises(RuntimeError, test_error)
def reader_decorator(reader):
def __reader__():
for item in reader():
img = np.array(item[0]).astype('float32').reshape(3, 224, 224)
label = np.array(item[1]).astype('int64').reshape(1)
yield img, label
return __reader__
class TestGradScalerStateDict(unittest.TestCase):
def train_resnet(self,
enable_amp=True,
use_data_loader=True,
use_save_load=True):
seed = 90
batch_size = train_parameters["batch_size"]
batch_num = 4
paddle.seed(seed)
paddle.framework.random._manual_program_seed(seed)
resnet = ResNet(use_cudnn=True)
optimizer = optimizer_setting(train_parameters,
parameter_list=resnet.parameters())
np.random.seed(seed)
train_reader = paddle.batch(
paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size)
dy_param_init_value = {}
for param in resnet.parameters():
dy_param_init_value[param.name] = param.numpy()
program = None
scaler = paddle.amp.GradScaler(enable=enable_amp,
init_loss_scaling=2.**10)
if use_data_loader:
train_reader = paddle.batch(reader_decorator(
paddle.dataset.flowers.train(use_xmap=False)),
batch_size=batch_size,
drop_last=True)
train_loader = fluid.io.DataLoader.from_generator(
capacity=4,
use_double_buffer=True,
iterable=True,
return_list=True)
train_loader.set_sample_list_generator(train_reader)
train_reader = train_loader
for batch_id, data in enumerate(train_reader()):
if batch_id >= batch_num:
break
if use_data_loader:
img, label = data
else:
dy_x_data = np.array([x[0].reshape(3, 224, 224)
for x in data]).astype('float32')
if len(np.array([x[1]
for x in data]).astype('int64')) != batch_size:
continue
y_data = np.array([x[1] for x in data
]).astype('int64').reshape(-1, 1)
img = paddle.to_tensor(dy_x_data)
label = paddle.to_tensor(y_data)
label.stop_gradient = True
with paddle.amp.auto_cast(enable=enable_amp):
out = resnet(img)
loss = paddle.nn.functional.cross_entropy(input=out, label=label)
avg_loss = paddle.mean(x=loss)
dy_out = avg_loss.numpy()
scaled_loss = scaler.scale(avg_loss)
scaled_loss.backward()
scaler.minimize(optimizer, scaled_loss)
dy_grad_value = {}
for param in resnet.parameters():
if param.trainable:
np_array = np.array(param._grad_ivar().value().get_tensor())
dy_grad_value[param.name +
fluid.core.grad_var_suffix()] = np_array
resnet.clear_gradients()
dy_param_value = {}
for param in resnet.parameters():
dy_param_value[param.name] = param.numpy()
if use_save_load and batch_id == 2:
paddle.save(scaler.state_dict(), 'ResNet_model.pdparams')
dict_load = paddle.load('ResNet_model.pdparams')
scaler.load_state_dict(dict_load)
if use_data_loader:
train_reader._reset()
return dy_out, dy_param_value, dy_grad_value
def test_with_state_dict(self):
def func_isinstance():
with fluid.dygraph.guard():
out_use_state_dict = self.train_resnet(enable_amp=True,
use_data_loader=True,
use_save_load=True)
out_no_state_dict = self.train_resnet(enable_amp=True,
use_data_loader=True,
use_save_load=False)
print('save_load:', out_use_state_dict[0], out_no_state_dict[0])
self.assertTrue(
np.allclose(out_use_state_dict[0], out_no_state_dict[0]))
func_isinstance()
class TestAmpDecorator(unittest.TestCase):
def test_mode_exception(self):
def func():
with fluid.dygraph.guard():
model = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
opt = paddle.optimizer.SGD(parameters=model.parameters())
model, opt = paddle.amp.decorate(models=model,
optimizers=opt,
level='O')
self.assertRaises(ValueError, func)
def test_input_type_exception(self):
def test_error_model():
class MyModel(object):
def __init__(self):
print("A fake Model")
model = MyModel()
with fluid.dygraph.guard():
paddle.amp.decorate(models=model, optimizers=None, level='O2')
self.assertRaises(TypeError, test_error_model)
def test_error_distributed_model():
model = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
model = paddle.DataParallel(model)
with fluid.dygraph.guard():
model = paddle.amp.decorate(models=model, level='O2')
self.assertRaises(RuntimeError, test_error_distributed_model)
def test_error_optimizer():
class MyOptimizer(object):
def __init__(self):
print("A fake Optimizer")
model = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
opt = MyOptimizer()
with fluid.dygraph.guard():
paddle.amp.decorate(models=model, optimizers=opt, level='O2')
self.assertRaises(TypeError, test_error_optimizer)
def test_set_master_weight(self):
model1 = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
opt1 = paddle.optimizer.Adam(learning_rate=0.0001,
parameters=model1.parameters(),
multi_precision=True)
model2 = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
opt2 = paddle.optimizer.Adam(learning_rate=0.0001,
parameters=model2.parameters(),
multi_precision=False)
model1, opt1 = paddle.amp.decorate(models=model1,
optimizers=opt1,
level='O2',
master_weight=None)
self.assertEqual(opt1._multi_precision, True)
models, opt2 = paddle.amp.decorate(models=[model1, model2],
optimizers=opt2,
level='O2',
master_weight=None)
self.assertEqual(opt2._multi_precision, True)
model3 = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
opt3 = paddle.optimizer.Adam(learning_rate=0.0001,
parameters=model3.parameters())
model4 = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
opt4 = paddle.optimizer.Adam(learning_rate=0.0001,
parameters=model4.parameters())
model3, opts = paddle.amp.decorate(models=model3,
optimizers=[opt3, opt4],
level='O2',
master_weight=True)
self.assertEqual(opts[0]._multi_precision, True)
self.assertEqual(opts[1]._multi_precision, True)
models = [model3, model4]
optimizers = [opt3, opt4]
models, optimizers = paddle.amp.decorate(models=models,
optimizers=optimizers,
level='O2',
master_weight=False)
self.assertEqual(optimizers[0]._multi_precision, False)
self.assertEqual(optimizers[1]._multi_precision, False)
def test_skip_BatchNorm_Layer_norm(self):
model = paddle.nn.LayerNorm(1)
model = paddle.amp.decorate(models=model, level='O2')
for param in model.parameters():
self.assertEqual((param.dtype == paddle.float32), True)
model = paddle.nn.BatchNorm(1)
model = paddle.amp.decorate(models=model, level='O2')
for param in model.parameters():
self.assertEqual((param.dtype == paddle.float32), True)
model = paddle.nn.BatchNorm1D(1)
model = paddle.amp.decorate(models=model, level='O2')
for param in model.parameters():
self.assertEqual((param.dtype == paddle.float32), True)
model = paddle.nn.BatchNorm2D(1)
model = paddle.amp.decorate(models=model, level='O2')
for param in model.parameters():
self.assertEqual((param.dtype == paddle.float32), True)
model = paddle.nn.BatchNorm3D(1)
model = paddle.amp.decorate(models=model, level='O2')
for param in model.parameters():
self.assertEqual((param.dtype == paddle.float32), True)
class TestStateDictHookForAMP(unittest.TestCase):
def test_state_dict_hook(self):
def func_isinstance():
paddle.seed(100)
model = paddle.nn.Linear(2, 4)
model = paddle.amp.decorate(models=model,
level='O2',
save_dtype='float32')
param_value_ori = {}
for param in model.parameters():
param_value_ori[param.name] = param.numpy()
state_dict = model.state_dict()
for key, value in state_dict.items():
state_dict[key] = value.cast("float16")
model.set_state_dict(state_dict)
param_value_now = {}
for param in model.parameters():
param_value_now[param.name] = param.numpy()
for key in param_value_ori.keys():
print(np.equal(param_value_ori[key], param_value_now[key]))
func_isinstance()
class TestPureFp16SaveLoad(unittest.TestCase):
def setUp(self):
self.temp_dir = tempfile.TemporaryDirectory()
def tearDown(self):
self.temp_dir.cleanup()
def test_save_dtype_exception(self):
def func():
paddle.disable_static()
model = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None)
opt = paddle.optimizer.SGD(parameters=model.parameters())
paddle.amp.decorate(models=model,
optimizers=opt,
level='O2',
save_dtype='int')
self.assertRaises(ValueError, func)
def train_resnet(self,
enable_amp=True,
use_data_loader=True,
use_save_load=True):
seed = 90
batch_size = train_parameters["batch_size"]
batch_num = 4
paddle.seed(seed)
paddle.framework.random._manual_program_seed(seed)
resnet = ResNet(use_cudnn=True)
optimizer = optimizer_setting(train_parameters,
parameter_list=resnet.parameters())
np.random.seed(seed)
train_reader = paddle.batch(
paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size)
dy_param_init_value = {}
for param in resnet.parameters():
dy_param_init_value[param.name] = param.numpy()
program = None
scaler = paddle.amp.GradScaler(enable=enable_amp,
init_loss_scaling=2.**10)
if use_data_loader:
train_reader = paddle.batch(reader_decorator(
paddle.dataset.flowers.train(use_xmap=False)),
batch_size=batch_size,
drop_last=True)
train_loader = fluid.io.DataLoader.from_generator(
capacity=4,
use_double_buffer=True,
iterable=True,
return_list=True)
train_loader.set_sample_list_generator(train_reader)
train_reader = train_loader
if enable_amp:
resnet, optimizer = paddle.amp.decorate(models=resnet,
optimizers=optimizer,
level='O2',
save_dtype='float32')
for batch_id, data in enumerate(train_reader()):
if batch_id >= batch_num:
break
if use_data_loader:
img, label = data
else:
dy_x_data = np.array([x[0].reshape(3, 224, 224)
for x in data]).astype('float32')
if len(np.array([x[1]
for x in data]).astype('int64')) != batch_size:
continue
y_data = np.array([x[1] for x in data
]).astype('int64').reshape(-1, 1)
img = paddle.to_tensor(dy_x_data)
label = paddle.to_tensor(y_data)
label.stop_gradient = True
with paddle.amp.auto_cast(enable=enable_amp, level='O2'):
out = resnet(img)
loss = paddle.nn.functional.cross_entropy(input=out, label=label)
loss = paddle.cast(loss, 'float32')
avg_loss = paddle.mean(x=loss)
dy_out = avg_loss.numpy()
scaled_loss = scaler.scale(avg_loss)
scaled_loss.backward()
scaler.minimize(optimizer, scaled_loss)
dy_grad_value = {}
for param in resnet.parameters():
if param.trainable:
np_array = np.array(param._grad_ivar().value().get_tensor())
dy_grad_value[param.name +
fluid.core.grad_var_suffix()] = np_array
resnet.clear_gradients()
dy_param_value = {}
for param in resnet.parameters():
dy_param_value[param.name] = param.numpy()
if use_save_load and batch_id == 2:
# paddle.save
obj = {
'model': resnet.state_dict(),
'opt': optimizer.state_dict(),
'scaler': scaler.state_dict()
}
path = os.path.join(self.temp_dir.name, 'model.pdparams')
paddle.save(obj, path)
# paddle.load
obj_load = paddle.load(path)
resnet = ResNet(use_cudnn=True)
optimizer = optimizer_setting(
train_parameters, parameter_list=resnet.parameters())
resnet.set_state_dict(obj_load['model'])
optimizer.set_state_dict(obj_load['opt'])
scaler.load_state_dict(obj_load['scaler'])
resnet, optimizer = paddle.amp.decorate(models=resnet,
optimizers=optimizer,
level='O2',
save_dtype='float32')
if use_data_loader:
train_reader._reset()
return dy_out, dy_param_value, dy_grad_value
def test_with_save_load(self):
def func_isinstance():
with fluid.dygraph.guard():
out_use_save_load = self.train_resnet(enable_amp=True,
use_data_loader=True,
use_save_load=True)
out_no_save_load = self.train_resnet(enable_amp=True,
use_data_loader=True,
use_save_load=False)
print('save_load:', out_use_save_load[0], out_no_save_load[0])
self.assertTrue(
np.allclose(out_use_save_load[0], out_no_save_load[0]))
func_isinstance()
class TestPureFp16InferenceSaveLoad(unittest.TestCase):
def setUp(self):
self.temp_dir = tempfile.TemporaryDirectory()
def tearDown(self):
self.temp_dir.cleanup()
def inference_save_load(self):
BATCH_SIZE = 16
BATCH_NUM = 4
EPOCH_NUM = 4
IMAGE_SIZE = 784
CLASS_NUM = 10
# define a random dataset
class RandomDataset(paddle.io.Dataset):
def __init__(self, num_samples):
self.num_samples = num_samples
def __getitem__(self, idx):
image = np.random.random([IMAGE_SIZE]).astype('float32')
label = np.random.randint(0, CLASS_NUM - 1,
(1, )).astype('int64')
return image, label
def __len__(self):
return self.num_samples
class LinearNet(nn.Layer):
def __init__(self):
super(LinearNet, self).__init__()
self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
def forward(self, x):
return self._linear(x)
def train(layer, loader, loss_fn, opt):
for epoch_id in range(EPOCH_NUM):
for batch_id, (image, label) in enumerate(loader()):
with paddle.amp.auto_cast(enable=True,
custom_white_list=None,
custom_black_list=None,
level='O2'):
out = layer(image)
loss = loss_fn(out, label)
loss.backward()
opt.step()
opt.clear_grad()
# train
layer = LinearNet()
adam = paddle.optimizer.Adam(learning_rate=0.001,
parameters=layer.parameters(),
multi_precision=True)
loss_fn = nn.CrossEntropyLoss()
layer, adam = paddle.amp.decorate(models=layer,
optimizers=adam,
save_dtype='float32')
dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
loader = paddle.io.DataLoader(dataset,
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True,
num_workers=2)
train(layer, loader, loss_fn, adam)
# save
path = os.path.join(self.temp_dir.name, 'example_model/linear')
paddle.jit.save(layer,
path,
input_spec=[InputSpec(shape=[IMAGE_SIZE], name='x')])
# jit.load
loaded_layer = paddle.jit.load(path)
# inference
loaded_layer.eval()
x = np.random.randn(1, IMAGE_SIZE).astype('float32')
x_tensor = paddle.to_tensor(x)
pred = loaded_layer(x_tensor)
# load_inference_model
paddle.enable_static()
exe = paddle.static.Executor()
[inference_program, feed_target_names,
fetch_targets] = (paddle.static.load_inference_model(path, exe))
tensor_img = x
results = exe.run(inference_program,
feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets)
print("pred.numpy()", pred.numpy())
print("result", results[0])
self.assertTrue(np.array_equal(pred.numpy(), results[0]))
paddle.disable_static()
def test_inference_save_load(self):
self.inference_save_load()
class TestResnet2(unittest.TestCase):
"""
Use paddle-2.0 API
"""
def train_resnet(self,
enable_amp=True,
level='O1',
use_data_loader=False,
use_param_group=False):
seed = 90
batch_size = train_parameters["batch_size"]
batch_num = 10
paddle.seed(seed)
paddle.framework.random._manual_program_seed(seed)
resnet = ResNet(use_cudnn=True)
if use_param_group:
conv_params = resnet.conv.parameters()
other_params = []
for p in resnet.parameters():
contains = False
for q in conv_params:
if p is q:
contains = True
if not contains:
other_params.append(p)
# NOTE(zhiqiu): The membership test operations (in / not in) call "is" and "==",
# see details: https://docs.python.org/3/reference/expressions.html#membership-test-operations.
# So do not use other_params = [p for p in resnet.parameters() if p not in conv_params]
optimizer = paddle.optimizer.Momentum(parameters=[{
'params':
conv_params,
'learning_rate':
0.01
}, {
'params':
other_params,
'learning_rate':
0.001
}],
multi_precision=True)
else:
optimizer = paddle.optimizer.SGD(parameters=resnet.parameters())
np.random.seed(seed)
train_reader = paddle.batch(
paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size)
dy_param_init_value = {}
for param in resnet.parameters():
dy_param_init_value[param.name] = param.numpy()
program = None
scaler = paddle.amp.GradScaler(enable=enable_amp,
init_loss_scaling=2.**10)
if use_data_loader:
train_reader = paddle.batch(reader_decorator(
paddle.dataset.flowers.train(use_xmap=False)),
batch_size=batch_size,
drop_last=True)
train_loader = fluid.io.DataLoader.from_generator(
capacity=4,
use_double_buffer=True,
iterable=True,
return_list=True)
train_loader.set_sample_list_generator(train_reader)
train_reader = train_loader
if enable_amp and (level == 'O2'):
resnet = paddle.amp.decorate(models=resnet, level='O2')
for batch_id, data in enumerate(train_reader()):
if batch_id >= batch_num:
break
if use_data_loader:
img, label = data
else:
dy_x_data = np.array([x[0].reshape(3, 224, 224)
for x in data]).astype('float32')
if len(np.array([x[1]
for x in data]).astype('int64')) != batch_size:
continue
y_data = np.array([x[1] for x in data
]).astype('int64').reshape(-1, 1)
img = paddle.to_tensor(dy_x_data)
label = paddle.to_tensor(y_data)
label.stop_gradient = True
with paddle.amp.auto_cast(enable=enable_amp, level=level):
out = resnet(img)
loss = paddle.nn.functional.cross_entropy(input=out, label=label)
loss = paddle.cast(loss, 'float32')
avg_loss = paddle.mean(x=loss)
dy_out = avg_loss.numpy()
scaled_loss = scaler.scale(avg_loss)
scaled_loss.backward()
scaler.unscale_(optimizer)
scaler.step(optimizer)
scaler.update()
dy_grad_value = {}
for param in resnet.parameters():
if param.trainable:
np_array = np.array(param._grad_ivar().value().get_tensor())
dy_grad_value[param.name +
fluid.core.grad_var_suffix()] = np_array
resnet.clear_gradients()
dy_param_value = {}
for param in resnet.parameters():
dy_param_value[param.name] = param.numpy()
if use_data_loader:
train_reader._reset()
return dy_out, dy_param_value, dy_grad_value
def test_resnet(self):
def func_isinstance():
with fluid.dygraph.guard():
out_fp32 = self.train_resnet(enable_amp=False)
out_amp = self.train_resnet(enable_amp=True)
out_pure_fp16 = self.train_resnet(enable_amp=True, level='O2')
print(out_fp32[0], out_amp[0], out_pure_fp16[0])
self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5))
self.assertTrue(
np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2))
func_isinstance()
def test_with_data_loader(self):
def func_isinstance():
with fluid.dygraph.guard():
out_fp32 = self.train_resnet(enable_amp=False,
use_data_loader=True)
out_amp = self.train_resnet(enable_amp=True,
use_data_loader=True)
out_pure_fp16 = self.train_resnet(enable_amp=True,
use_data_loader=True,
level='O2')
print(out_fp32[0], out_amp[0], out_pure_fp16[0])
self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5))
self.assertTrue(
np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2))
func_isinstance()
def test_param_group(self):
def func_isinstance():
with fluid.dygraph.guard():
out_fp32 = self.train_resnet(enable_amp=False,
use_data_loader=True,
use_param_group=True)
out_amp = self.train_resnet(enable_amp=True,
use_data_loader=True,
use_param_group=True)
out_pure_fp16 = self.train_resnet(enable_amp=True,
use_data_loader=True,
use_param_group=True,
level='O2')
print(out_fp32[0], out_amp[0], out_pure_fp16[0])
self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5))
self.assertTrue(
np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2))
func_isinstance()
class TestResnet(unittest.TestCase):
"""
Use paddle-1.x API
"""
def train_resnet(self, enable_amp=True, level='O1'):
seed = 90
batch_size = train_parameters["batch_size"]
batch_num = 1
with fluid.dygraph.guard():
paddle.seed(seed)
paddle.framework.random._manual_program_seed(seed)
resnet = ResNet(use_cudnn=True)
optimizer = optimizer_setting(train_parameters,
parameter_list=resnet.parameters())
optimizer = paddle.optimizer.Momentum(
parameters=resnet.parameters(), multi_precision=True)
np.random.seed(seed)
train_reader = paddle.batch(
paddle.dataset.flowers.train(use_xmap=False),
batch_size=batch_size)
dy_param_init_value = {}
for param in resnet.parameters():
dy_param_init_value[param.name] = param.numpy()
program = None
scaler = paddle.fluid.dygraph.AmpScaler(enable=enable_amp,
init_loss_scaling=2.**10)
if enable_amp and (level == 'O2'):
resnet, optimizer = paddle.fluid.dygraph.amp_decorate(
models=resnet, optimizers=optimizer, level='O2')
for batch_id, data in enumerate(train_reader()):
if batch_id >= batch_num:
break
dy_x_data = np.array([x[0].reshape(3, 224, 224)
for x in data]).astype('float32')
if len(np.array([x[1]
for x in data]).astype('int64')) != batch_size:
continue
y_data = np.array([x[1] for x in data
]).astype('int64').reshape(-1, 1)
img = fluid.dygraph.to_variable(dy_x_data)
label = fluid.dygraph.to_variable(y_data)
label.stop_gradient = True
with paddle.fluid.dygraph.amp_guard(enable=enable_amp,
level=level):
out = resnet(img)
loss = fluid.layers.cross_entropy(input=out, label=label)
avg_loss = fluid.layers.mean(x=loss)
dy_out = avg_loss.numpy()
scaled_loss = scaler.scale(avg_loss)
scaled_loss.backward()
scaler.minimize(optimizer, scaled_loss)
dy_grad_value = {}
for param in resnet.parameters():
if param.trainable:
np_array = np.array(
param._grad_ivar().value().get_tensor())
dy_grad_value[param.name +
fluid.core.grad_var_suffix()] = np_array
resnet.clear_gradients()
dy_param_value = {}
for param in resnet.parameters():
dy_param_value[param.name] = param.numpy()
return dy_out, dy_param_value, dy_grad_value
def test_resnet(self):
def func_isinstance():
out_fp32 = self.train_resnet(enable_amp=False)
out_amp = self.train_resnet(enable_amp=True)
out_pure_fp16 = self.train_resnet(enable_amp=True, level='O2')
print(out_fp32[0], out_amp[0], out_pure_fp16[0])
self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2))
self.assertTrue(
np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-1))
func_isinstance()
class TestLayerNormFp16(unittest.TestCase):
r''' layer_norm and batch_norm support mixed inputs, i.e., only input x is fp16
and other params are fp32.
'''
def test_layer_norm_fp16(self):
def func_isinstance():
if fluid.is_compiled_with_cuda():
with fluid.dygraph.guard(fluid.CUDAPlace(0)):
x = paddle.rand([2, 2, 2, 3])
layer_norm = paddle.nn.LayerNorm(x.shape[1:])
with paddle.amp.auto_cast(custom_white_list=['layer_norm']):
out = layer_norm(x)
self.assertTrue(
out.dtype == fluid.core.VarDesc.VarType.FP16)
func_isinstance()
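# Editor's note: the helper below is not part of the original test file. It is a minimal,
# hedged sketch of the mixed-input behaviour described in the docstring above: under
# auto_cast the input is cast to fp16 while the LayerNorm parameters stay fp32. The
# helper name is illustrative only.
def _layer_norm_mixed_input_sketch():
    if not fluid.is_compiled_with_cuda():
        return
    with fluid.dygraph.guard(fluid.CUDAPlace(0)):
        x = paddle.rand([2, 2, 2, 3])
        layer_norm = paddle.nn.LayerNorm(x.shape[1:])
        # LayerNorm parameters are created in fp32 and are not cast by auto_cast (O1)
        assert layer_norm.weight.dtype == paddle.float32
        with paddle.amp.auto_cast(custom_white_list=['layer_norm']):
            out = layer_norm(x)  # x is auto-cast to fp16; the fp32 params are used as-is
        assert out.dtype == fluid.core.VarDesc.VarType.FP16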
class TestBf16(unittest.TestCase):
'''
test amp for BF16
'''
def train(self, enable_amp=True, amp_level='O1'):
paddle.seed(100)
input = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.)
conv = paddle.nn.Conv2D(4, 6, (3, 3))
with paddle.amp.auto_cast(enable=enable_amp,
level=amp_level,
dtype='bfloat16'):
output = conv(input)
output = output.cast('float32')
return output.numpy()
def test_bf16(self):
def func_isinstance():
if fluid.core.is_compiled_with_cuda(
) and fluid.core.is_bfloat16_supported(paddle.CUDAPlace(0)):
out_fp32 = self.train(enable_amp=False)
out_bf16_O1 = self.train(enable_amp=True, amp_level='O1')
out_bf16_O2 = self.train(enable_amp=True, amp_level='O2')
self.assertTrue(
np.allclose(out_fp32, out_bf16_O1, rtol=1.e-3, atol=1.e-1))
self.assertTrue(
np.allclose(out_fp32, out_bf16_O2, rtol=1.e-3, atol=1.e-1))
func_isinstance()
class TestAmpWithPyLyer(unittest.TestCase):
def test_pylayer(self):
class MyMM(PyLayer):
@staticmethod
def forward(ctx, a, b):
ctx.save_for_backward(a, b)
return a.mm(b)
@staticmethod
def backward(ctx, grad):
a, b = ctx.saved_tensor()
# NOTE(zhiqiu): a and b are float32 now, while grad is fp16 when forward runs with auto_cast(),
# so the mm operation used to raise errors because the dtypes of its inputs were inconsistent.
return grad.mm(b.t()), a.t().mm(grad)
x = paddle.rand([10, 10])
y = paddle.rand([10, 10])
x.stop_gradient = False
y.stop_gradient = False
# with paddle.amp.auto_cast():
res = MyMM.apply(x, y)
loss = paddle.mean(res)
loss.backward()
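# Editor's note: a minimal sketch (not part of the original test) of one way a custom
# PyLayer can stay robust under auto_cast, following the NOTE above: cast the incoming
# gradient to the dtype of the saved tensors inside backward. The class name is illustrative.
class _CastingMM(PyLayer):

    @staticmethod
    def forward(ctx, a, b):
        ctx.save_for_backward(a, b)
        return a.mm(b)

    @staticmethod
    def backward(ctx, grad):
        a, b = ctx.saved_tensor()
        # align the (possibly fp16) grad with the fp32 saved tensors before mm
        grad = paddle.cast(grad, a.dtype)
        return grad.mm(b.t()), a.t().mm(grad)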
class TestAmpWithHook(unittest.TestCase):
def test_hook_change_dtype(self):
def func_isinstance():
with paddle.fluid.dygraph.guard():
v = paddle.rand([3, 3])
v.stop_gradient = False
def foo(grad):
print('grad', grad, grad.dtype) # grad's dtype is float32
res = paddle.mm(grad, grad) # mm runs in fp16
print('res', res, res.dtype) # res's dtype is float16
return res
v.register_hook(foo)
with paddle.amp.auto_cast():
a = paddle.mm(v, v)
loss = a.sum()
self.assertRaises(RuntimeError, loss.backward)
func_isinstance()
def test_hook_change_place(self):
def func_isinstance():
with paddle.fluid.dygraph.guard():
v = paddle.rand([3, 3])
v.stop_gradient = False
def foo(grad):
res = grad.cpu() # change place
return res
v.register_hook(foo)
with paddle.amp.auto_cast():
a = paddle.mm(v, v)
loss = a.sum()
self.assertRaises(RuntimeError, loss.backward)
func_isinstance()
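# Editor's note: a hedged sketch (not part of the original tests) of a gradient hook that
# remains legal under auto_cast: it only rescales the gradient, keeping both dtype and
# place unchanged. The helper name is illustrative only.
def _safe_hook_sketch():
    with paddle.fluid.dygraph.guard():
        v = paddle.rand([3, 3])
        v.stop_gradient = False
        # the hook only rescales the gradient; dtype and place stay the same
        v.register_hook(lambda grad: grad * 2.0)
        with paddle.amp.auto_cast():
            a = paddle.mm(v, v)
        loss = a.sum()
        loss.backward()  # expected to succeed, unlike the dtype/place-changing hooks above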
if __name__ == '__main__':
unittest.main()
...@@ -18,7 +18,7 @@ import unittest ...@@ -18,7 +18,7 @@ import unittest
import numpy as np import numpy as np
import paddle import paddle
from paddle.autograd import PyLayer, EagerPyLayer from paddle.autograd.py_layer import LegacyPyLayer, EagerPyLayer
from paddle.fluid.framework import _test_eager_guard, in_dygraph_mode from paddle.fluid.framework import _test_eager_guard, in_dygraph_mode
...@@ -32,7 +32,7 @@ class TestPyLayer(unittest.TestCase): ...@@ -32,7 +32,7 @@ class TestPyLayer(unittest.TestCase):
def func_test_simple_pylayer_multiple_output(self): def func_test_simple_pylayer_multiple_output(self):
class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): class tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x1, x2, func1, func2=paddle.square): def forward(ctx, x1, x2, func1, func2=paddle.square):
...@@ -70,7 +70,7 @@ class TestPyLayer(unittest.TestCase): ...@@ -70,7 +70,7 @@ class TestPyLayer(unittest.TestCase):
def func_test_simple_pylayer_return_none_with_no_grad(self): def func_test_simple_pylayer_return_none_with_no_grad(self):
class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): class tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x1, x2, func1, func2=paddle.square): def forward(ctx, x1, x2, func1, func2=paddle.square):
...@@ -112,7 +112,7 @@ class TestPyLayer(unittest.TestCase): ...@@ -112,7 +112,7 @@ class TestPyLayer(unittest.TestCase):
def func_test_simple_pylayer_single_output(self): def func_test_simple_pylayer_single_output(self):
class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): class tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x1, func1, func2=paddle.square): def forward(ctx, x1, func1, func2=paddle.square):
...@@ -146,7 +146,7 @@ class TestPyLayer(unittest.TestCase): ...@@ -146,7 +146,7 @@ class TestPyLayer(unittest.TestCase):
def func_test_pylayer_num_output_match(self): def func_test_pylayer_num_output_match(self):
class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): class tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward( def forward(
...@@ -175,7 +175,7 @@ class TestPyLayer(unittest.TestCase): ...@@ -175,7 +175,7 @@ class TestPyLayer(unittest.TestCase):
def func_test_pylayer_dtype(self): def func_test_pylayer_dtype(self):
class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): class tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x, dtype): def forward(ctx, x, dtype):
...@@ -206,7 +206,7 @@ class TestPyLayer(unittest.TestCase): ...@@ -206,7 +206,7 @@ class TestPyLayer(unittest.TestCase):
def func_test_pylayer_Exception_forward(self): def func_test_pylayer_Exception_forward(self):
class Layer_None1(EagerPyLayer if in_dygraph_mode() else PyLayer): class Layer_None1(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, *args): def forward(ctx, *args):
...@@ -220,7 +220,7 @@ class TestPyLayer(unittest.TestCase): ...@@ -220,7 +220,7 @@ class TestPyLayer(unittest.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
z = Layer_None1.apply(input1) z = Layer_None1.apply(input1)
class Layer_None2(EagerPyLayer if in_dygraph_mode() else PyLayer): class Layer_None2(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, *args): def forward(ctx, *args):
...@@ -234,7 +234,7 @@ class TestPyLayer(unittest.TestCase): ...@@ -234,7 +234,7 @@ class TestPyLayer(unittest.TestCase):
# return None # return None
z = Layer_None2.apply(input1) z = Layer_None2.apply(input1)
class Layer_one1(EagerPyLayer if in_dygraph_mode() else PyLayer): class Layer_one1(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, *args): def forward(ctx, *args):
...@@ -249,7 +249,7 @@ class TestPyLayer(unittest.TestCase): ...@@ -249,7 +249,7 @@ class TestPyLayer(unittest.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
z = Layer_one1.apply(input1) z = Layer_one1.apply(input1)
class Layer_one2(EagerPyLayer if in_dygraph_mode() else PyLayer): class Layer_one2(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, *args): def forward(ctx, *args):
...@@ -263,7 +263,7 @@ class TestPyLayer(unittest.TestCase): ...@@ -263,7 +263,7 @@ class TestPyLayer(unittest.TestCase):
# return int # return int
z = Layer_one2.apply(input1) z = Layer_one2.apply(input1)
class Layer_no_fw(EagerPyLayer if in_dygraph_mode() else PyLayer): class Layer_no_fw(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def backward(ctx, *args): def backward(ctx, *args):
...@@ -280,7 +280,7 @@ class TestPyLayer(unittest.TestCase): ...@@ -280,7 +280,7 @@ class TestPyLayer(unittest.TestCase):
def func_test_pylayer_nograd(self): def func_test_pylayer_nograd(self):
class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): class tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x1, func1, func2=paddle.square, xx=None): def forward(ctx, x1, func1, func2=paddle.square, xx=None):
...@@ -305,7 +305,8 @@ class TestPyLayer(unittest.TestCase): ...@@ -305,7 +305,8 @@ class TestPyLayer(unittest.TestCase):
def func_test_pylayer_Exception_bk(self): def func_test_pylayer_Exception_bk(self):
class Layer_bk_none1(EagerPyLayer if in_dygraph_mode() else PyLayer): class Layer_bk_none1(
EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x): def forward(ctx, x):
...@@ -322,7 +323,8 @@ class TestPyLayer(unittest.TestCase): ...@@ -322,7 +323,8 @@ class TestPyLayer(unittest.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
z.sum().backward() z.sum().backward()
class Layer_bk_none2(EagerPyLayer if in_dygraph_mode() else PyLayer): class Layer_bk_none2(
EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x1, x2): def forward(ctx, x1, x2):
...@@ -339,7 +341,8 @@ class TestPyLayer(unittest.TestCase): ...@@ -339,7 +341,8 @@ class TestPyLayer(unittest.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
z.mean().backward() z.mean().backward()
class Layer_bk_one1(EagerPyLayer if in_dygraph_mode() else PyLayer): class Layer_bk_one1(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer
):
@staticmethod @staticmethod
def forward(ctx, x): def forward(ctx, x):
...@@ -356,7 +359,8 @@ class TestPyLayer(unittest.TestCase): ...@@ -356,7 +359,8 @@ class TestPyLayer(unittest.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
z.mean().backward() z.mean().backward()
class Layer_bk_one2(EagerPyLayer if in_dygraph_mode() else PyLayer): class Layer_bk_one2(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer
):
@staticmethod @staticmethod
def forward(ctx, x1, x2): def forward(ctx, x1, x2):
...@@ -374,7 +378,7 @@ class TestPyLayer(unittest.TestCase): ...@@ -374,7 +378,7 @@ class TestPyLayer(unittest.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
z.mean().backward() z.mean().backward()
class Layer_no_bk(EagerPyLayer if in_dygraph_mode() else PyLayer): class Layer_no_bk(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x): def forward(ctx, x):
...@@ -388,7 +392,8 @@ class TestPyLayer(unittest.TestCase): ...@@ -388,7 +392,8 @@ class TestPyLayer(unittest.TestCase):
z = z[0] + z[1] z = z[0] + z[1]
z.mean().backward() z.mean().backward()
class Layer_bk_match(EagerPyLayer if in_dygraph_mode() else PyLayer): class Layer_bk_match(
EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x): def forward(ctx, x):
...@@ -412,7 +417,8 @@ class TestPyLayer(unittest.TestCase): ...@@ -412,7 +417,8 @@ class TestPyLayer(unittest.TestCase):
def func_test_pylayer_bk_return_none(self): def func_test_pylayer_bk_return_none(self):
class Layer_bk_none1(EagerPyLayer if in_dygraph_mode() else PyLayer): class Layer_bk_none1(
EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x1, x2): def forward(ctx, x1, x2):
...@@ -431,7 +437,8 @@ class TestPyLayer(unittest.TestCase): ...@@ -431,7 +437,8 @@ class TestPyLayer(unittest.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
z.mean().backward() z.mean().backward()
class Layer_bk_none2(EagerPyLayer if in_dygraph_mode() else PyLayer): class Layer_bk_none2(
EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x1, x2): def forward(ctx, x1, x2):
...@@ -457,7 +464,7 @@ class TestPyLayer(unittest.TestCase): ...@@ -457,7 +464,7 @@ class TestPyLayer(unittest.TestCase):
def func_test_pylayer_inplace(self): def func_test_pylayer_inplace(self):
class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): class cus_tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x): def forward(ctx, x):
...@@ -494,7 +501,8 @@ class TestPyLayer(unittest.TestCase): ...@@ -494,7 +501,8 @@ class TestPyLayer(unittest.TestCase):
def test_pylayer_inplace_backward_error(self): def test_pylayer_inplace_backward_error(self):
with _test_eager_guard(): with _test_eager_guard():
class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): class cus_tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer
):
@staticmethod @staticmethod
def forward(ctx, x): def forward(ctx, x):
...@@ -530,7 +538,8 @@ class TestPyLayer(unittest.TestCase): ...@@ -530,7 +538,8 @@ class TestPyLayer(unittest.TestCase):
def test_pylayer_inplace_backward_success_1(self): def test_pylayer_inplace_backward_success_1(self):
with _test_eager_guard(): with _test_eager_guard():
class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): class cus_tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer
):
@staticmethod @staticmethod
def forward(ctx, x): def forward(ctx, x):
...@@ -564,7 +573,8 @@ class TestPyLayer(unittest.TestCase): ...@@ -564,7 +573,8 @@ class TestPyLayer(unittest.TestCase):
def test_pylayer_inplace_backward_success_2(self): def test_pylayer_inplace_backward_success_2(self):
with _test_eager_guard(): with _test_eager_guard():
class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): class cus_tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer
):
@staticmethod @staticmethod
def forward(ctx, x): def forward(ctx, x):
...@@ -597,7 +607,8 @@ class TestPyLayer(unittest.TestCase): ...@@ -597,7 +607,8 @@ class TestPyLayer(unittest.TestCase):
def func_test_pylayer_inplace_and_leaf_exception(self): def func_test_pylayer_inplace_and_leaf_exception(self):
class cus_pylayer_op(EagerPyLayer if in_dygraph_mode() else PyLayer): class cus_pylayer_op(
EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x): def forward(ctx, x):
...@@ -633,7 +644,7 @@ class TestPyLayer(unittest.TestCase): ...@@ -633,7 +644,7 @@ class TestPyLayer(unittest.TestCase):
def func_test_backward_in_backward(self): def func_test_backward_in_backward(self):
class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): class cus_tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x): def forward(ctx, x):
...@@ -665,7 +676,7 @@ class TestPyLayer(unittest.TestCase): ...@@ -665,7 +676,7 @@ class TestPyLayer(unittest.TestCase):
def func_test_return_to_tensor(self): def func_test_return_to_tensor(self):
class Tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): class Tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x1): def forward(ctx, x1):
...@@ -779,7 +790,7 @@ class TestPyLayerReturnType(unittest.TestCase): ...@@ -779,7 +790,7 @@ class TestPyLayerReturnType(unittest.TestCase):
def test_forward_args_fake_tensor(self): def test_forward_args_fake_tensor(self):
class Tanh(PyLayer): class Tanh(LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x1): def forward(ctx, x1):
...@@ -797,7 +808,7 @@ class TestPyLayerReturnType(unittest.TestCase): ...@@ -797,7 +808,7 @@ class TestPyLayerReturnType(unittest.TestCase):
def test_forward_kwargs_fake_tensor(self): def test_forward_kwargs_fake_tensor(self):
class Tanh(PyLayer): class Tanh(LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x1): def forward(ctx, x1):
...@@ -815,7 +826,7 @@ class TestPyLayerReturnType(unittest.TestCase): ...@@ -815,7 +826,7 @@ class TestPyLayerReturnType(unittest.TestCase):
def test_forward_return_fake_tensor(self): def test_forward_return_fake_tensor(self):
class Tanh(PyLayer): class Tanh(LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x1): def forward(ctx, x1):
...@@ -833,7 +844,7 @@ class TestPyLayerReturnType(unittest.TestCase): ...@@ -833,7 +844,7 @@ class TestPyLayerReturnType(unittest.TestCase):
def test_forward_return_fake_tensor_tuple(self): def test_forward_return_fake_tensor_tuple(self):
class Tanh(PyLayer): class Tanh(LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x1): def forward(ctx, x1):
...@@ -851,7 +862,7 @@ class TestPyLayerReturnType(unittest.TestCase): ...@@ -851,7 +862,7 @@ class TestPyLayerReturnType(unittest.TestCase):
def test_backward_return_fake_tensor_tuple(self): def test_backward_return_fake_tensor_tuple(self):
class Tanh(PyLayer): class Tanh(LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x1, x2): def forward(ctx, x1, x2):
...@@ -871,7 +882,7 @@ class TestPyLayerReturnType(unittest.TestCase): ...@@ -871,7 +882,7 @@ class TestPyLayerReturnType(unittest.TestCase):
def test_backward_return_fake_tensor(self): def test_backward_return_fake_tensor(self):
class Tanh(PyLayer): class Tanh(LegacyPyLayer):
@staticmethod @staticmethod
def forward(ctx, x1): def forward(ctx, x1):
......
...@@ -31,7 +31,7 @@ from paddle.distributed import alltoall, all_gather ...@@ -31,7 +31,7 @@ from paddle.distributed import alltoall, all_gather
from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker
from paddle.distributed import fleet from paddle.distributed import fleet
from paddle.autograd import PyLayer, EagerPyLayer from paddle.autograd import PyLayer
from .gate import NaiveGate, GShardGate, SwitchGate, BaseGate from .gate import NaiveGate, GShardGate, SwitchGate, BaseGate
from .utils import count_by_gate from .utils import count_by_gate
from paddle.distributed.fleet.meta_parallel.pp_utils.utils import _hp_recompute from paddle.distributed.fleet.meta_parallel.pp_utils.utils import _hp_recompute
...@@ -132,53 +132,6 @@ class MoEScatter(PyLayer): ...@@ -132,53 +132,6 @@ class MoEScatter(PyLayer):
return grad_in, None, None, None return grad_in, None, None, None
class EagerMoEScatter(EagerPyLayer):
r"""
Scatter input samples from [batch x sequences] to contiguous alone experts.
If `world_size` is greater than 1, the samples will first be locally
scattered, and then exchanged across workers.
"""
@staticmethod
def forward(ctx,
inp,
pos,
local_expert_count,
global_expert_count,
fwd_batch_size,
world_size,
group=None):
local_input_buf = _local_scatter(inp, pos)
if world_size > 1:
global_input_buf = global_scatter(local_input_buf,
local_expert_count,
global_expert_count,
group=group)
else:
global_input_buf = local_input_buf
ctx.moe_args = inp.shape[0], world_size, group
variables = (pos, local_expert_count, global_expert_count)
ctx.save_for_backward(*variables)
return global_input_buf
@staticmethod
def backward(ctx, grad):
(pos, local_expert_count, global_expert_count) = ctx.saved_tensor()
(inp_batch_size, world_size, group) = ctx.moe_args
if world_size > 1:
local_grad_in = global_gather(grad,
local_expert_count,
global_expert_count,
group=group)
else:
local_grad_in = grad
grad_in = _local_gather(local_grad_in, pos, inp_batch_size)
return grad_in, None, None, None
class MoEGather(PyLayer): class MoEGather(PyLayer):
r""" r"""
Gather output samples from contiguous alone experts back to [batch x Gather output samples from contiguous alone experts back to [batch x
...@@ -226,53 +179,6 @@ class MoEGather(PyLayer): ...@@ -226,53 +179,6 @@ class MoEGather(PyLayer):
return global_grad_out_buf, None, None, None return global_grad_out_buf, None, None, None
class EagerMoEGather(EagerPyLayer):
r"""
Gather output samples from contiguous alone experts back to [batch x
sequences]. Works symmetrically with MoEScatter.
"""
@staticmethod
def forward(ctx,
global_output_buf,
pos,
local_expert_count,
global_expert_count,
local_batch_size,
world_size,
group=None):
if world_size > 1:
local_output_buf = global_gather(global_output_buf,
local_expert_count,
global_expert_count,
group=group)
else:
local_output_buf = global_output_buf
output = _local_gather(local_output_buf,
pos,
local_batch_size,
maybe_overlap=False)
ctx.moe_args = (global_output_buf.shape[0], world_size, group)
variables = (pos, local_expert_count, global_expert_count)
ctx.save_for_backward(*variables)
return output
@staticmethod
def backward(ctx, grad_out):
pos, local_expert_count, global_expert_count = ctx.saved_tensor()
fwd_batch_size, world_size, group = ctx.moe_args
grad_out_buf = _local_scatter(grad_out, pos)
if world_size > 1:
global_grad_out_buf = global_scatter(grad_out_buf,
local_expert_count,
global_expert_count,
group=group)
else:
global_grad_out_buf = grad_out_buf
return global_grad_out_buf, None, None, None
class AllGather(PyLayer): class AllGather(PyLayer):
r""" r"""
A wrapper for the All-Gather function to support auto-differentiation. A wrapper for the All-Gather function to support auto-differentiation.
...@@ -295,28 +201,6 @@ class AllGather(PyLayer): ...@@ -295,28 +201,6 @@ class AllGather(PyLayer):
ends=[(rank + 1) * dim0]) ends=[(rank + 1) * dim0])
class EagerAllGather(EagerPyLayer):
r"""
A wrapper for the All-Gather function to support auto-differentiation.
"""
@staticmethod
def forward(ctx, inp, rank, world_size, group):
tensor_list = []
paddle.distributed.all_gather(tensor_list, inp, group=group)
output = paddle.concat(tensor_list, axis=0)
ctx.args = rank, inp.shape[0]
return output
@staticmethod
def backward(ctx, grad_out):
rank, dim0 = ctx.args
return paddle.slice(grad_out,
axes=[0],
starts=[rank * dim0],
ends=[(rank + 1) * dim0])
class Slice(PyLayer): class Slice(PyLayer):
r""" r"""
A wrapper for the Slice function to support auto-differentiation. A wrapper for the Slice function to support auto-differentiation.
...@@ -341,30 +225,6 @@ class Slice(PyLayer): ...@@ -341,30 +225,6 @@ class Slice(PyLayer):
return _all_gather(grad_out, group=group) return _all_gather(grad_out, group=group)
class EagerSlice(EagerPyLayer):
r"""
A wrapper for the Slice function to support auto-differentiation.
"""
@staticmethod
def forward(ctx, inp, rank, world_size, group):
B = inp.shape[0]
local_batch_size = B // world_size
batch_start = local_batch_size * rank
batch_end = min(batch_start + local_batch_size, B)
inp = paddle.slice(inp,
axes=[0],
starts=[batch_start],
ends=[batch_end])
ctx.args = world_size, group
return inp
@staticmethod
def backward(ctx, grad_out):
world_size, group = ctx.args
return _all_gather(grad_out, group=group)
def prepare_forward(gate, num_expert, world_size, moe_group): def prepare_forward(gate, num_expert, world_size, moe_group):
pos, local_expert_count, global_expert_count = count_by_gate( pos, local_expert_count, global_expert_count = count_by_gate(
gate, num_expert, world_size, group=moe_group) gate, num_expert, world_size, group=moe_group)
...@@ -517,10 +377,7 @@ class MoELayer(nn.Layer): ...@@ -517,10 +377,7 @@ class MoELayer(nn.Layer):
mp_rank = self.mp_group.rank mp_rank = self.mp_group.rank
mp_size = self.mp_group.nranks mp_size = self.mp_group.nranks
if mp_size > 1: if mp_size > 1:
if in_dygraph_mode(): inp = Slice.apply(inp, mp_rank, mp_size, self.mp_group)
inp = EagerSlice.apply(inp, mp_rank, mp_size, self.mp_group)
else:
inp = Slice.apply(inp, mp_rank, mp_size, self.mp_group)
value, gate = self.gate(inp) value, gate = self.gate(inp)
( (
...@@ -541,14 +398,9 @@ class MoELayer(nn.Layer): ...@@ -541,14 +398,9 @@ class MoELayer(nn.Layer):
temp_pos = pos temp_pos = pos
assert topk == self.top_k assert topk == self.top_k
if in_dygraph_mode(): x = MoEScatter.apply(inp, temp_pos, local_expert_count,
x = EagerMoEScatter.apply(inp, temp_pos, local_expert_count, global_expert_count, fwd_batch_size,
global_expert_count, fwd_batch_size, self.world_size, self.group)
self.world_size, self.group)
else:
x = MoEScatter.apply(inp, temp_pos, local_expert_count,
global_expert_count, fwd_batch_size,
self.world_size, self.group)
d_model = self.d_model d_model = self.d_model
...@@ -577,23 +429,15 @@ class MoELayer(nn.Layer): ...@@ -577,23 +429,15 @@ class MoELayer(nn.Layer):
if len(gate.shape) == 2: if len(gate.shape) == 2:
out_batch_size *= gate.shape[1] out_batch_size *= gate.shape[1]
if in_dygraph_mode(): x = MoEGather.apply(x, pos, local_expert_count, global_expert_count,
x = EagerMoEGather.apply(x, pos, local_expert_count, out_batch_size, self.world_size, self.group)
global_expert_count, out_batch_size,
self.world_size, self.group)
else:
x = MoEGather.apply(x, pos, local_expert_count, global_expert_count,
out_batch_size, self.world_size, self.group)
x = x.reshape([-1, self.top_k, d_model]) x = x.reshape([-1, self.top_k, d_model])
value = value.reshape([x.shape[0], 1, self.top_k]) value = value.reshape([x.shape[0], 1, self.top_k])
x = paddle.bmm(value, x).reshape([-1, d_model]) x = paddle.bmm(value, x).reshape([-1, d_model])
if mp_size > 1: if mp_size > 1:
if in_dygraph_mode(): x = AllGather.apply(x, mp_rank, mp_size, self.mp_group)
x = EagerAllGather.apply(x, mp_rank, mp_size, self.mp_group)
else:
x = AllGather.apply(x, mp_rank, mp_size, self.mp_group)
x = paddle.reshape_(x, origin_shape) x = paddle.reshape_(x, origin_shape)
......