diff --git a/paddle/fluid/pybind/eager_py_layer.cc b/paddle/fluid/pybind/eager_py_layer.cc index 283134ccfc04b24ab1fce0bb84656a182bd3d267..25eb696085c09f93e294f1ea5be99a16712ecd27 100644 --- a/paddle/fluid/pybind/eager_py_layer.cc +++ b/paddle/fluid/pybind/eager_py_layer.cc @@ -129,16 +129,19 @@ PyObject* pylayer_method_apply(PyObject* cls, bool require_any_grad = false; size_t inputs_size = 0; + size_t args_size = 0; + size_t kwargs_size = 0; PyObject* forward_args = nullptr; PyObject* kwargs_value_list = nullptr; if (kwargs) { - inputs_size = PyDict_Size(kwargs); + kwargs_size = PyDict_Size(kwargs); kwargs_value_list = PyDict_Values(kwargs); - forward_args = PyTuple_New(1); - } else { - inputs_size = PyTuple_GET_SIZE(args); - forward_args = PyTuple_New(inputs_size + 1); } + if (args) { + args_size = PyTuple_GET_SIZE(args); + } + inputs_size = kwargs_size + args_size; + forward_args = PyTuple_New(args_size + 1); Py_INCREF(ctx); PyTuple_SET_ITEM(forward_args, 0, reinterpret_cast<PyObject*>(ctx)); @@ -150,8 +153,8 @@ PyObject* pylayer_method_apply(PyObject* cls, ctx->forward_input_tensor_is_duplicable.reserve(inputs_size); for (size_t i = 0; i < inputs_size; i++) { PyObject* obj = nullptr; - if (kwargs) { - obj = PyList_GetItem(kwargs_value_list, i); + if (i >= args_size) { + obj = PyList_GetItem(kwargs_value_list, i - args_size); } else { obj = PyTuple_GET_ITEM(args, i); } @@ -212,7 +215,7 @@ PyObject* pylayer_method_apply(PyObject* cls, } } - if (!kwargs) { + if (i < args_size) { Py_INCREF(obj); PyTuple_SET_ITEM(forward_args, i + 1, obj); } diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py index b13a4591b4ef23cca4bd50894aa99c5f3872e154..6669e4f4c70aaf820a994dafc9b1946f52104a07 100644 --- a/python/paddle/autograd/__init__.py +++ b/python/paddle/autograd/__init__.py @@ -17,7 +17,13 @@ from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401 from ..framework import is_grad_enabled, set_grad_enabled # noqa: F401 from . import backward_mode # noqa: F401 from .backward_mode import backward # noqa: F401 -from .py_layer import PyLayer, PyLayerContext, EagerPyLayer, EagerPyLayerContext # noqa: F401 +from ..fluid.framework import _in_eager_mode_ +if _in_eager_mode_: + from .py_layer import EagerPyLayer as PyLayer # noqa: F401 + from .py_layer import EagerPyLayerContext as PyLayerContext # noqa: F401 +else: + from .py_layer import LegacyPyLayer as PyLayer # noqa: F401 + from .py_layer import LegacyPyLayerContext as PyLayerContext # noqa: F401 from ..framework import set_grad_enabled, is_grad_enabled # noqa: F401 from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401 from .functional import vjp, jvp, Jacobian, Hessian # noqa: F401 diff --git a/python/paddle/autograd/py_layer.py b/python/paddle/autograd/py_layer.py index 64946268bd7c9e82e58c53ee2160c8aa994794ce..22fc8bf47c1840d7a87699725225639bddc0fb2b 100644 --- a/python/paddle/autograd/py_layer.py +++ b/python/paddle/autograd/py_layer.py @@ -21,7 +21,7 @@ from paddle.fluid import core __all__ = [] -class PyLayerContext(object): +class LegacyPyLayerContext(object): """ The object of this class is a context that is used in PyLayer to enhance the function.
@@ -181,7 +181,7 @@ class CPyLayer(object): return core.pylayer_apply(place, cls, *args, **kwargs) -class PyLayerBackward(PyLayerContext): +class PyLayerBackward(LegacyPyLayerContext): def backward(self, *args, **kwargs): with paddle.fluid.dygraph.guard(): @@ -205,7 +205,7 @@ class LayerMeta(type): return super(LayerMeta, cls).__init__(name, bases, attrs) -class PyLayer(with_mateclass(LayerMeta, CPyLayer)): +class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)): """ Build a custom `Layer` by creating subclasses. Subclasses need to follow the following rules: 1. Subclasses contain `forward` and `backward` function. Both forward and backward are @staticmethod. @@ -425,6 +425,8 @@ class EagerPyLayerContext(object): Examples: .. code-block:: python + import os + os.environ['FLAGS_enable_eager_mode'] = '1' import paddle from paddle.autograd import PyLayer import numpy as np @@ -464,6 +466,8 @@ class EagerPyLayerContext(object): Examples: .. code-block:: python + import os + os.environ['FLAGS_enable_eager_mode'] = '1' import paddle from paddle.autograd import PyLayer import numpy as np diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 1fd5bde1a54686d58d550780d3498dc0bfa7a5ae..cb634a4b6ac1a78bc09967ec1736afbe4d15c576 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -1181,9 +1181,9 @@ def _mp_allreduce(tensor, if in_dygraph_mode(): assert op == ReduceOp.SUM, "Unknown parameter: {}.".format(op) - from paddle.autograd import EagerPyLayer + from paddle.autograd import PyLayer - class mp_allreduce_eager(EagerPyLayer): + class mp_allreduce_eager(PyLayer): @staticmethod def forward(ctx, tensor, use_calc_stream, ring_id, diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index d41f0fbb845706a8341ced2218755c897b65e4d7..f4f2076cd12b79282bc2e3e325bb546f8c40f227 100755 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -37,7 +37,7 @@ from ..meta_optimizers import HybridParallelOptimizer, HeterParallelOptimizer from paddle import _C_ops from paddle.fluid import core from paddle.fluid.dygraph import to_variable -from paddle.distributed.fleet.utils.recompute import RecomputeFunction +from paddle.distributed.fleet.utils.recompute import LegacyRecomputeFunction from paddle.fluid.dygraph.varbase_patch_methods import _grad_scalar __all__ = [] @@ -68,7 +68,8 @@ class _RecomputeModelWrapper(paddle.nn.Layer): return do_run def _checkpoint(self, func, *args, **kwargs): - return RecomputeFunction.apply(func, self._preserve_rng_state, *args) + return LegacyRecomputeFunction.apply(func, self._preserve_rng_state, + *args) def forward(self, input): end = 0 diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index 4fed58fe133ddc63ea323e4099a0e44d611c5020..46fe7e641733a5bc5768d835161f78f9e18d1a43 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -17,7 +17,7 @@ import contextlib import paddle from paddle.fluid import core from paddle import _C_ops -from paddle.autograd import PyLayer, EagerPyLayer +from paddle.autograd import PyLayer from paddle.fluid import framework from ...utils.recompute import check_recompute_necessary, detach_variable, swith_rng_state_tracker from ..parallel_layers.random import 
get_rng_state_tracker @@ -151,7 +151,7 @@ def _merge_activation(tensor): return _all_gather(tensor, group=mp_group) -class _HPEagerRecomputeFunction(EagerPyLayer): +class _HPRecomputeFunction(PyLayer): """ Compared with paddle.distributed.fleet.utils.recompute, there are the following differences: 1. In order to support PipeLineParallel, the input of recompute is modified to ensure that the input can be tuple type. @@ -256,7 +256,7 @@ class _HPEagerRecomputeFunction(EagerPyLayer): detached_inputs = detach_variable(tuple(inputs)) outputs = ctx.run_function(*detached_inputs) - if isinstance(outputs, core.eager.Tensor): + if isinstance(outputs, (core.VarBase, core.eager.Tensor)): outputs = (outputs, ) assert len(outputs) == len(args) @@ -266,137 +266,8 @@ class _HPEagerRecomputeFunction(EagerPyLayer): for i in range(len(outputs)): if isinstance( outputs[i], - core.eager.Tensor) and not outputs[i].stop_gradient: - forward_outputs_with_grad.append(outputs[i]) - backward_inputs.append(args[i]) - - if len(forward_outputs_with_grad) == 0: - raise RuntimeError( - "none of output has stop_gradient=False, this recompute() is not necessary" - ) - - # actually backward - paddle.autograd.backward(forward_outputs_with_grad, backward_inputs) - grads = tuple(inp._grad_ivar() for inp in detached_inputs - if isinstance(inp, core.eager.Tensor)) - return grads - - -class _HPRecomputeFunction(PyLayer): - """ - Compared with paddle.distributed.fleet.utils.recompute, there are the following differences: - 1. In order to support PipeLineParallel, the input of recompute is modified to ensure that the input can be tuple type. - 2. Offload support for activation - 3. Support MP segmentation of activation to further reduce cuda memory - 4. Adapt to the random state of MP - """ - - @staticmethod - def forward(ctx, run_function, all_outputs, *args): - check_recompute_necessary(args) - - # store for recomputing - ctx.run_function = run_function - - # store the rng states - ctx.fwd_cuda_rng_state = paddle.get_cuda_rng_state() - ctx.fwd_cuda_rng_state_tracker = get_rng_state_tracker( - ).get_states_tracker() - - # save input for backward - ctx.inputs = [] - ctx.tensor_indices = [] - ctx.tensor_shapes = [] - tensor_inputs = [] - - cur_device = paddle.get_device() - assert 'gpu:' in paddle.get_device( - ), "Recompute with RNG is not support current device: {}.".format( - cur_device) - - # TODO support AMP - tracer = framework._dygraph_tracer() - ctx.is_fw_autocast = False if tracer._amp_level == core.AmpLevel.O0 else True - if tracer._amp_level == core.AmpLevel.O2: - ctx.amp_level = 'O2' - elif tracer._amp_level in (core.AmpLevel.O1, core.AmpLevel.O0): - ctx.amp_level = 'O1' - else: - raise ValueError("unsupported amp level: {}".format( - tracer._amp_level)) - ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list() - - with paddle.no_grad(): - outputs = run_function(*args) - - for i, arg in enumerate(args): - if paddle.is_tensor(arg): - state = arg.stop_gradient - if _recompute_partition: - ctx.tensor_shapes.append(arg.shape) - partition = _split_activation(arg.detach()).clone() - # TODO(shenliang03) not use calculate stream to D2H to speed - arg = partition.cpu() if _recompute_offload else partition - else: - arg = arg.cpu() if _recompute_offload else arg - arg.stop_gradient = state - tensor_inputs.append(arg) - ctx.tensor_indices.append(i) - ctx.inputs.append(None) - else: - ctx.inputs.append(arg) - - ctx.save_for_backward(*tensor_inputs) - - if paddle.is_tensor(outputs): - all_outputs += [outputs] - return 
outputs - else: - all_outputs += outputs - return tuple(outputs) - - @staticmethod - def backward(ctx, *args): - with paddle.fluid.dygraph.guard(): - # Restore inputs - inputs = list(ctx.inputs) - tensor_indices = ctx.tensor_indices - tensor_shapes = ctx.tensor_shapes - tensors = list(ctx.saved_tensor()) - - device_id = paddle.distributed.ParallelEnv().device_id - for i, idx in enumerate(tensor_indices): - if _recompute_partition: - state = tensors[i].stop_gradient - tensors[i] = _merge_activation( - tensors[i]).detach().reshape_(tensor_shapes[i]) - tensors[i].stop_gradient = state - inputs[idx] = tensors[i].cuda( - device_id) if _recompute_offload else tensors[i] - - tracer = framework._dygraph_tracer() - tracer._has_grad = True - - # need restore auto_cast state as well as w/b list - with swith_rng_state_tracker(ctx.fwd_cuda_rng_state, - ctx.fwd_cuda_rng_state_tracker): - with paddle.amp.auto_cast(enable=ctx.is_fw_autocast, - custom_white_list=ctx.amp_white_list, - custom_black_list=ctx.amp_black_list, - level=ctx.amp_level): - detached_inputs = detach_variable(tuple(inputs)) - outputs = ctx.run_function(*detached_inputs) - - if isinstance(outputs, core.VarBase): - outputs = (outputs, ) - assert len(outputs) == len(args) - - forward_outputs_with_grad = [] - backward_inputs = [] - - for i in range(len(outputs)): - if isinstance(outputs[i], - core.VarBase) and not outputs[i].stop_gradient: + (core.VarBase, + core.eager.Tensor)) and not outputs[i].stop_gradient: forward_outputs_with_grad.append(outputs[i]) backward_inputs.append(args[i]) @@ -408,7 +279,7 @@ class _HPRecomputeFunction(PyLayer): # actually backward paddle.autograd.backward(forward_outputs_with_grad, backward_inputs) grads = tuple(inp._grad_ivar() for inp in detached_inputs - if isinstance(inp, core.VarBase)) + if isinstance(inp, (core.VarBase, core.eager.Tensor))) return grads @@ -420,10 +291,7 @@ def _hp_recompute(function, *args): # 3. Here, we only use float dtype to distinguish whether a gradient is needed in output tensor all_outputs = [] - if in_dygraph_mode(): - _HPEagerRecomputeFunction.apply(function, all_outputs, *args) - else: - _HPRecomputeFunction.apply(function, all_outputs, *args) + _HPRecomputeFunction.apply(function, all_outputs, *args) if len(all_outputs) == 1: return all_outputs[0] diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py index 0d6bfcf9224319cbec4b9647f7469e2c59aa4cea..abc5e0549ae668e62b6bade4b350163a0d7e9d48 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -20,7 +20,7 @@ from collections import OrderedDict import paddle from paddle import nn -from paddle.autograd import EagerPyLayer +from paddle.autograd import PyLayer import paddle.fluid.core as core import paddle.fluid.framework as framework from paddle.fluid.framework import EagerParamBase @@ -398,7 +398,7 @@ class GroupShardedStage3(nn.Layer): def _register_forward_hooks(self, layer): """ - Register EagerPyLayer to manage memory slices. + Register PyLayer to manage memory slices. There are four stages: FW 1. Before the forward layers, synchronize the full parameters. 
@@ -653,7 +653,7 @@ def ForwardPreHooks(layer, order_tracer, trainable_params, param2buffer_size, return -class ForwardPostHooks(EagerPyLayer): +class ForwardPostHooks(PyLayer): @staticmethod def forward(ctx, inputs, layer, order_tracer, trainable_params, diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py index 1f4439cf1171f9dd51f05d98ff969a8960b8873d..f0c74159488a782a556c8fab5f303ed0cead31b4 100755 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -14,7 +14,8 @@ import paddle from paddle.fluid import core -from paddle.autograd import PyLayer, EagerPyLayer +from paddle.autograd import PyLayer +from paddle.autograd.py_layer import LegacyPyLayer from paddle.fluid import framework import contextlib @@ -68,7 +69,7 @@ def swith_rng_state_tracker(rng_state, tracker): get_rng_state_tracker().set_states_tracker(orig_cuda_rng_tracker) -class EagerRecomputeFunction(EagerPyLayer): +class LegacyRecomputeFunction(LegacyPyLayer): @staticmethod def forward(ctx, run_function, preserve_rng_state, *args): @@ -171,7 +172,7 @@ class EagerRecomputeFunction(EagerPyLayer): detached_inputs = detach_variable(tuple(inputs)) outputs = ctx.run_function(*detached_inputs) - if isinstance(outputs, core.eager.Tensor): + if isinstance(outputs, core.VarBase): outputs = (outputs, ) assert len(outputs) == len(args) @@ -183,9 +184,8 @@ class EagerRecomputeFunction(EagerPyLayer): # the following backward_inputs_with_grad is used to avoid this case. backward_inputs_with_grad = [] for i in range(len(outputs)): - if isinstance( - outputs[i], - core.eager.Tensor) and not outputs[i].stop_gradient: + if isinstance(outputs[i], + core.VarBase) and not outputs[i].stop_gradient: forward_outputs_with_grad.append(outputs[i]) backward_inputs_with_grad.append(args[i]) @@ -199,8 +199,8 @@ class EagerRecomputeFunction(EagerPyLayer): paddle.autograd.backward(forward_outputs_with_grad, backward_inputs_with_grad) - grads = tuple(inp.grad for inp in detached_inputs - if isinstance(inp, core.eager.Tensor)) + grads = list(inp._grad_ivar() for inp in detached_inputs + if isinstance(inp, core.VarBase)) return grads @@ -307,7 +307,7 @@ class RecomputeFunction(PyLayer): detached_inputs = detach_variable(tuple(inputs)) outputs = ctx.run_function(*detached_inputs) - if isinstance(outputs, core.VarBase): + if isinstance(outputs, (core.VarBase, core.eager.Tensor)): outputs = (outputs, ) assert len(outputs) == len(args) @@ -319,8 +319,10 @@ class RecomputeFunction(PyLayer): # the following backward_inputs_with_grad is used to avoid this case. 
backward_inputs_with_grad = [] for i in range(len(outputs)): - if isinstance(outputs[i], - core.VarBase) and not outputs[i].stop_gradient: + if isinstance( + outputs[i], + (core.VarBase, + core.eager.Tensor)) and not outputs[i].stop_gradient: forward_outputs_with_grad.append(outputs[i]) backward_inputs_with_grad.append(args[i]) @@ -334,8 +336,14 @@ class RecomputeFunction(PyLayer): paddle.autograd.backward(forward_outputs_with_grad, backward_inputs_with_grad) - grads = list(inp._grad_ivar() for inp in detached_inputs - if isinstance(inp, core.VarBase)) + if in_dygraph_mode(): + grads = tuple( + inp._grad_ivar() for inp in detached_inputs + if isinstance(inp, (core.VarBase, core.eager.Tensor))) + else: + grads = list( + inp._grad_ivar() for inp in detached_inputs + if isinstance(inp, (core.VarBase, core.eager.Tensor))) return grads @@ -465,7 +473,4 @@ def recompute(function, *args, **kwargs): if framework._dygraph_tracer()._has_grad: check_recompute_necessary(args) - if in_dygraph_mode(): - return EagerRecomputeFunction.apply(function, preserve, *args) - else: - return RecomputeFunction.apply(function, preserve, *args) + return RecomputeFunction.apply(function, preserve, *args) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index cf717bd84fa0c9ad9274ad3ece7fea057dac50ca..6df9c8c4269caebc21730360e641f094fca268a4 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -60,7 +60,9 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_sharding_parallel) list(APPEND DIST_TEST_OPS test_dygraph_sharding_optimizer_stage2) list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage2) list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage3) +list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage3_for_eager) list(APPEND DIST_TEST_OPS test_dygraph_group_sharded_api) +list(APPEND DIST_TEST_OPS test_dygraph_group_sharded_api_for_eager) list(APPEND DIST_TEST_OPS test_auto_parallel_parallelizer) list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) list(APPEND DIST_TEST_OPS test_hybrid_parallel_inference_helper) @@ -305,13 +307,17 @@ if((NOT WITH_GPU) AND (NOT WITH_ROCM)) list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_optimizer_stage2) list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage2) list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage3) + list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage3_for_eager) list(REMOVE_ITEM TEST_OPS test_dygraph_group_sharded_api) + list(REMOVE_ITEM TEST_OPS test_dygraph_group_sharded_api_for_eager) list(REMOVE_ITEM TEST_OPS test_auto_parallel_parallelizer) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers) list(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) + list(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision_for_eager) list(REMOVE_ITEM TEST_OPS test_mixed_precision) list(REMOVE_ITEM TEST_OPS test_fleet_base_single) list(REMOVE_ITEM TEST_OPS test_dygraph_recompute) + list(REMOVE_ITEM TEST_OPS test_dygraph_recompute_for_eager) list(REMOVE_ITEM TEST_OPS test_hybrid_parallel_inference_helper) list(REMOVE_ITEM TEST_OPS test_parallel_class_center_sample) list(REMOVE_ITEM TEST_OPS test_parallel_margin_cross_entropy) @@ -1547,7 +1553,11 @@ if(WITH_DISTRIBUTE 120) set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 200) set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 350) + set_tests_properties(test_dygraph_sharding_stage3_for_eager PROPERTIES TIMEOUT + 350) 
set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120) + set_tests_properties(test_dygraph_group_sharded_api_for_eager + PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) set_tests_properties(test_hybrid_parallel_inference_helper PROPERTIES TIMEOUT @@ -1637,6 +1647,8 @@ endif() if(WITH_GPU OR WITH_ROCM) set_tests_properties(test_imperative_auto_mixed_precision PROPERTIES TIMEOUT 300) + set_tests_properties(test_imperative_auto_mixed_precision_for_eager + PROPERTIES TIMEOUT 300) set_tests_properties(test_parallel_dygraph_sync_batch_norm PROPERTIES TIMEOUT 120) set_tests_properties(test_rank_attention_op PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_with_pylayer.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_with_pylayer.py index 049c3a0858a847accca4e1a3a380937cd590bb82..2c5a873a3343a349895f06e31983d254c08cdfda 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_with_pylayer.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_with_pylayer.py @@ -21,7 +21,7 @@ import paddle import numpy as np import paddle.distributed as dist from paddle.fluid.dygraph.nn import Linear -from paddle.autograd import PyLayer, EagerPyLayer +from paddle.autograd import PyLayer from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients @@ -45,21 +45,6 @@ class cus_tanh(PyLayer): return grad -class cus_tanh_eager(EagerPyLayer): - - @staticmethod - def forward(ctx, x): - y = paddle.tanh(x) - ctx.save_for_backward(y) - return y - - @staticmethod - def backward(ctx, dy): - y, = ctx.saved_tensor() - grad = dy * (1 - paddle.square(y)) - return grad - - class SimpleNet(paddle.nn.Layer): def __init__(self, train_id, model_id): @@ -73,10 +58,7 @@ class SimpleNet(paddle.nn.Layer): def forward(self, inputs): if self.model_id == 0: - if in_dygraph_mode(): - inputs = cus_tanh_eager.apply(inputs) - elif _in_legacy_dygraph(): - inputs = cus_tanh.apply(inputs) + inputs = cus_tanh.apply(inputs) else: inputs = self.tanh(inputs) diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py index 0a51045dee5e1550d38e56924d8e39a239b8fc43..17183e95f2a7af2eb2dc51d28a344415a7aec2f6 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py @@ -15,6 +15,9 @@ from __future__ import print_function import os + +os.environ['FLAGS_enable_eager_mode'] = '0' + import unittest import paddle.fluid as fluid @@ -26,9 +29,7 @@ class TestDygraphGroupSharded(TestMultipleGpus): # check group sharded logic as well as the accuracy with single mode def test_dygraph_group_sharded(self): self.run_mnist_2gpu('dygraph_group_sharded_api.py', eager_mode=False) - self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py') if __name__ == "__main__": - os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api_for_eager.py b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api_for_eager.py new file mode 100644 index 0000000000000000000000000000000000000000..a8dd842cd1f85526502d81ce9ddc56db33ecb5fd --- /dev/null 
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api_for_eager.py @@ -0,0 +1,35 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os + +os.environ['FLAGS_enable_eager_mode'] = '1' + +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestDygraphGroupSharded(TestMultipleGpus): + + # check group sharded logic as well as the accuracy with single mode + def test_dygraph_group_sharded(self): + self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py b/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py index 799555a7b03d8db5a572b50d4307a678f136391a..11ca15fd33104b64cc9fb2ca6b6aee14e2f6d2cb 100755 --- a/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py @@ -23,7 +23,6 @@ from paddle.distributed.fleet.utils import recompute import random import paddle.fluid.layers as layers -from paddle.fluid.framework import _test_eager_guard def get_fc_block(block_idx, input_size, is_last=False): @@ -181,34 +180,15 @@ class TestPyLayer(unittest.TestCase): check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) def test_fc_net_with_dropout(self): - with _test_eager_guard(): - self.test_base_case() self.test_base_case() - def test_fc_net_without_restore_rng(self): - with _test_eager_guard(): - loss_ref, param_ref, grad_ref = run_model( - recompute_block=[2], - recompute_kwargs={"preserve_rng_state": False}, - enable_autocast=True) - def test_fc_net_with_amp(self): - with _test_eager_guard(): - self.test_base_case(enable_autocast=True) self.test_base_case(enable_autocast=True) def test_fc_net_with_fp16(self): - with _test_eager_guard(): - self.test_base_case(enable_autocast=True, pure_fp16=True) self.test_base_case(enable_autocast=True, pure_fp16=True) def test_recompute_kwargs(self): - with _test_eager_guard(): - paddle.set_device("gpu") - kwargs = {"is_test": False} - with self.assertRaises(ValueError): - loss_ref, param_ref, grad_ref = run_model( - recompute_block=[2], recompute_kwargs=kwargs) paddle.set_device("gpu") kwargs = {"is_test": False} with self.assertRaises(ValueError): @@ -216,11 +196,6 @@ class TestPyLayer(unittest.TestCase): recompute_kwargs=kwargs) def test_recompute_cpu_rng(self): - with _test_eager_guard(): - paddle.set_device("cpu") - with self.assertRaises(RuntimeError): - loss_ref, param_ref, grad_ref = run_model(recompute_block=[2]) - paddle.set_device("cpu") with self.assertRaises(RuntimeError): loss_ref, param_ref, grad_ref = run_model(recompute_block=[2]) diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_recompute_for_eager.py b/python/paddle/fluid/tests/unittests/test_dygraph_recompute_for_eager.py new file mode 100755 index 
0000000000000000000000000000000000000000..bc97d53485be99597bf0902870137fc1c6c0361d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dygraph_recompute_for_eager.py @@ -0,0 +1,215 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os + +os.environ['FLAGS_enable_eager_mode'] = '1' + +import unittest +import numpy as np + +import paddle +from paddle.autograd import PyLayer +from paddle.distributed.fleet.utils import recompute +import random + +import paddle.fluid.layers as layers + + +def get_fc_block(block_idx, input_size, is_last=False): + block_name = "block_" + str(block_idx) + block = paddle.nn.Sequential( + (block_name + "_fc_0", + paddle.nn.Linear(input_size, input_size, bias_attr=False)), + (block_name + "_dropout", paddle.nn.Dropout(p=0.5)), + (block_name + "_relu_1", paddle.nn.ReLU()), + (block_name + "_fc_1", + paddle.nn.Linear(input_size, input_size, bias_attr=False)), + (block_name + "_relu_2", paddle.nn.ReLU()), + ) + if is_last: + block.add_sublayer(block_name + "_fc_2", + paddle.nn.Linear(input_size, 1, + bias_attr=False)) # add sublayer + else: + block.add_sublayer(block_name + "_fc_2", + paddle.nn.Linear(input_size, + input_size, + bias_attr=False)) # add sublayer + return block + + +class Naive_fc_net(paddle.nn.Layer): + + def __init__(self, + input_size=10, + recompute_blocks=[1, 3], + recompute_kwargs={}): + super(Naive_fc_net, self).__init__() + self.recompute_blocks = recompute_blocks + self.recompute_kwargs = recompute_kwargs + self.runfunc0 = get_fc_block(0, input_size, is_last=False) + self.runfunc1 = get_fc_block(1, input_size, is_last=False) + self.runfunc2 = get_fc_block(2, input_size, is_last=False) + self.runfunc3 = get_fc_block(3, input_size, is_last=False) + self.runfunc4 = get_fc_block(4, input_size, is_last=True) + + def forward(self, inputs): + + if 0 in self.recompute_blocks: + inputs = recompute(self.runfunc0, inputs) + else: + inputs = self.runfunc0(inputs) + + if 1 in self.recompute_blocks: + inputs = recompute(self.runfunc1, inputs) + else: + inputs = self.runfunc1(inputs) + + if 2 in self.recompute_blocks: + inputs = recompute(self.runfunc2, inputs, **self.recompute_kwargs) + else: + inputs = self.runfunc2(inputs) + + if 3 in self.recompute_blocks: + inputs = recompute(self.runfunc3, inputs) + else: + inputs = self.runfunc3(inputs) + + if 4 in self.recompute_blocks: + inputs = recompute(self.runfunc4, inputs) + else: + inputs = self.runfunc4(inputs) + + return inputs + + +def run_model(recompute_block=[], + recompute_kwargs={}, + enable_autocast=False, + pure_fp16=False): + gen = paddle.seed(10) + gen.manual_seed(10) + np.random.seed(10) + random.seed(10) + + batch_size, input_size = 1, 10 + model = Naive_fc_net(input_size, + recompute_blocks=recompute_block, + recompute_kwargs=recompute_kwargs) + loss_fn = paddle.nn.MSELoss(reduction='mean') + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=model.parameters()) + 
+ if enable_autocast: + scaler = paddle.amp.GradScaler() + + loss_ = [] + param_ = [] + grad_ = [] + for step in range(10): + + x_data = np.random.randn(batch_size, input_size).astype(np.float32) + x = paddle.to_tensor(x_data) + # x.stop_gradient = False + level = 'O2' if pure_fp16 else 'O1' + with paddle.amp.auto_cast(True, level=level): + y_pred = model(x) + loss = y_pred.mean() + if enable_autocast: + scaler.scale(loss).backward() + scaler.minimize(optimizer, loss) + else: + loss_.append(np.asarray(loss).tolist()) + loss.backward() + optimizer.step() + + param_.append(np.asarray(model.parameters()[9]).tolist()) + grad_.append(np.asarray(model.parameters()[3]._grad_ivar()).tolist()) + + optimizer.clear_grad() + return loss_, param_, grad_ + + +class TestPyLayer(unittest.TestCase): + + def test_base_case(self, enable_autocast=False, pure_fp16=False): + + def check_identical(loss_ref, param_ref, grad_ref, loss, param, grad): + self.assertEqual(loss_ref, loss) + self.assertEqual(param_ref, param) + self.assertEqual(grad_ref, grad) + + # without recompute + loss_ref, param_ref, grad_ref = run_model( + recompute_block=[], + enable_autocast=enable_autocast, + pure_fp16=pure_fp16) + + # recompute second block + loss, param, grad = run_model(recompute_block=[1], + enable_autocast=enable_autocast, + pure_fp16=pure_fp16) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute fourth block + loss, param, grad = run_model(recompute_block=[3], + enable_autocast=enable_autocast, + pure_fp16=pure_fp16) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute second to fourth block + loss, param, grad = run_model(recompute_block=[1, 2, 3], + enable_autocast=enable_autocast, + pure_fp16=pure_fp16) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute second & fourth block + loss, param, grad = run_model(recompute_block=[1, 3], + enable_autocast=enable_autocast, + pure_fp16=pure_fp16) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + def test_fc_net_with_dropout(self): + self.test_base_case() + + def test_fc_net_without_restore_rng(self): + loss_ref, param_ref, grad_ref = run_model( + recompute_block=[2], + recompute_kwargs={"preserve_rng_state": False}, + enable_autocast=True) + + def test_fc_net_with_amp(self): + self.test_base_case(enable_autocast=True) + + def test_fc_net_with_fp16(self): + self.test_base_case(enable_autocast=True, pure_fp16=True) + + def test_recompute_kwargs(self): + paddle.set_device("gpu") + kwargs = {"is_test": False} + with self.assertRaises(ValueError): + loss_ref, param_ref, grad_ref = run_model(recompute_block=[2], + recompute_kwargs=kwargs) + + def test_recompute_cpu_rng(self): + paddle.set_device("cpu") + with self.assertRaises(RuntimeError): + loss_ref, param_ref, grad_ref = run_model(recompute_block=[2]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py index 6175634e70013b3e85dc61e2a71977182860d7a1..940d59816a30fabe183c8927269f3324a67ff804 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py @@ -15,6 +15,9 @@ from __future__ import print_function import os + +os.environ['FLAGS_enable_eager_mode'] = '0' + import unittest import paddle.fluid as fluid @@ -25,15 +28,12 @@ class TestDygraphShardingStage3(TestMultipleGpus): 
# check sharding logic as well as the accuracy with single mode def test_dygraph_sharding_stage3(self): - self.run_mnist_2gpu('dygraph_group_sharded_stage3.py') self.run_mnist_2gpu('dygraph_sharding_stage3.py', eager_mode=False) def test_dygraph_sharding_stage3_offload(self): - self.run_mnist_2gpu('dygraph_group_sharded_stage3_offload.py') self.run_mnist_2gpu('dygraph_sharding_stage3_offload.py', eager_mode=False) if __name__ == "__main__": - os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3_for_eager.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3_for_eager.py new file mode 100644 index 0000000000000000000000000000000000000000..2326a0a16ef626f5e9876c9fd005bfb3d6ef4597 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3_for_eager.py @@ -0,0 +1,40 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os + +os.environ['FLAGS_enable_eager_mode'] = '1' + +import os +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestDygraphShardingStage3(TestMultipleGpus): + + # check sharding logic as well as the accuracy with single mode + def test_dygraph_sharding_stage3(self): + self.run_mnist_2gpu('dygraph_group_sharded_stage3.py') + + def test_dygraph_sharding_stage3_offload(self): + self.run_mnist_2gpu('dygraph_group_sharded_stage3_offload.py') + + +if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 6a5ddd3157bfb95fdf0af89fc2531d6369e92c5b..9649e9c68eda290d7c4c6322077d67cb7d9c60aa 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os + +os.environ['FLAGS_enable_eager_mode'] = '0' + import unittest import paddle import paddle.fluid as fluid @@ -19,13 +23,11 @@ import paddle.fluid.core as core import numpy as np import six import cv2 -import os import tempfile from test_imperative_resnet import ResNet, BottleneckBlock, ConvBNLayer, train_parameters, optimizer_setting import paddle.nn as nn from paddle.static import InputSpec from paddle.autograd import PyLayer -from paddle.fluid.framework import _test_eager_guard if fluid.core.is_compiled_with_cuda(): fluid.set_flags({"FLAGS_cudnn_deterministic": True}) @@ -73,8 +75,6 @@ class TestAutoCast(unittest.TestCase): self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32) def test_amp_guard_white_op(self): - with _test_eager_guard(): - self.amp_guard_white_op() self.amp_guard_white_op() def amp_guard_black_op(self): @@ -88,8 +88,6 @@ class TestAutoCast(unittest.TestCase): self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32) def test_amp_guard_black_op(self): - with _test_eager_guard(): - self.amp_guard_black_op() self.amp_guard_black_op() def custom_op_list(self): @@ -123,8 +121,6 @@ class TestAutoCast(unittest.TestCase): | {"conv2d"}) def test_custom_op_list(self): - with _test_eager_guard(): - self.custom_op_list() self.custom_op_list() def custom_op_list_exception(self): @@ -145,8 +141,6 @@ class TestAutoCast(unittest.TestCase): self.assertRaises(ValueError, func) def test_custom_op_list_exception(self): - with _test_eager_guard(): - self.custom_op_list_exception() self.custom_op_list_exception() def amp_guard_upsupported_fp16_op(self): @@ -174,8 +168,6 @@ class TestAutoCast(unittest.TestCase): out_purefp16_fp32.dtype == fluid.core.VarDesc.VarType.FP32) def test_amp_guard_upsupported_fp16_op(self): - with _test_eager_guard(): - self.amp_guard_upsupported_fp16_op() self.amp_guard_upsupported_fp16_op() def mode_exception(self): @@ -195,8 +187,6 @@ class TestAutoCast(unittest.TestCase): self.assertRaises(ValueError, func) def test_mode_exception(self): - with _test_eager_guard(): - self.mode_exception() self.mode_exception() @@ -212,8 +202,6 @@ class TestAmpScaler(unittest.TestCase): data.numpy() * 1024), True) def test_scale(self): - with _test_eager_guard(): - self.scale() self.scale() def minimize(self): @@ -265,8 +253,6 @@ class TestAmpScaler(unittest.TestCase): outs_no_scaler[1][i][0].numpy()), True) def test_minimize(self): - with _test_eager_guard(): - self.minimize() self.minimize() def step(self): @@ -310,8 +296,6 @@ class TestAmpScaler(unittest.TestCase): outs_no_scaler[i].numpy()), True) def test_step(self): - with _test_eager_guard(): - self.step() self.step() def nan_inf(self): @@ -344,8 +328,6 @@ class TestAmpScaler(unittest.TestCase): np.array_equal(param.numpy(), params_init[param.name])) def test_nan_inf(self): - with _test_eager_guard(): - self.nan_inf() self.nan_inf() def step_update_exception(self): @@ -396,8 +378,6 @@ class TestAmpScaler(unittest.TestCase): self.assertRaises(RuntimeError, func3) def test_step_update_exception(self): - with _test_eager_guard(): - self.step_update_exception() self.step_update_exception() def test_get_and_set(self): @@ -578,8 +558,6 @@ class TestGradScalerStateDict(unittest.TestCase): self.assertTrue( np.allclose(out_use_state_dict[0], out_no_state_dict[0])) - with _test_eager_guard(): - func_isinstance() func_isinstance() @@ -742,8 +720,6 @@ class TestStateDictHookForAMP(unittest.TestCase): for key in param_value_ori.keys(): print(np.equal(param_value_ori[key], param_value_now[key])) - with 
_test_eager_guard(): - func_isinstance() func_isinstance() @@ -899,8 +875,6 @@ class TestPureFp16SaveLoad(unittest.TestCase): self.assertTrue( np.allclose(out_use_save_load[0], out_no_save_load[0])) - with _test_eager_guard(): - func_isinstance() func_isinstance() @@ -1005,8 +979,6 @@ class TestPureFp16InferenceSaveLoad(unittest.TestCase): def test_inference_save_load(self): self.inference_save_load() - with _test_eager_guard(): - self.inference_save_load() class TestResnet2(unittest.TestCase): @@ -1146,8 +1118,6 @@ class TestResnet2(unittest.TestCase): self.assertTrue( np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2)) - with _test_eager_guard(): - func_isinstance() func_isinstance() def test_with_data_loader(self): @@ -1166,8 +1136,6 @@ class TestResnet2(unittest.TestCase): self.assertTrue( np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2)) - with _test_eager_guard(): - func_isinstance() func_isinstance() def test_param_group(self): @@ -1189,8 +1157,6 @@ class TestResnet2(unittest.TestCase): self.assertTrue( np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2)) - with _test_eager_guard(): - func_isinstance() func_isinstance() @@ -1285,8 +1251,6 @@ class TestResnet(unittest.TestCase): self.assertTrue( np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-1)) - with _test_eager_guard(): - func_isinstance() func_isinstance() @@ -1308,8 +1272,6 @@ class TestLayerNormFp16(unittest.TestCase): self.assertTrue( out.dtype == fluid.core.VarDesc.VarType.FP16) - with _test_eager_guard(): - func_isinstance() func_isinstance() @@ -1344,8 +1306,6 @@ class TestBf16(unittest.TestCase): self.assertTrue( np.allclose(out_fp32, out_bf16_O2, rtol=1.e-3, atol=1.e-1)) - with _test_eager_guard(): - func_isinstance() func_isinstance() @@ -1399,8 +1359,6 @@ class TestAmpWithHook(unittest.TestCase): loss = a.sum() self.assertRaises(RuntimeError, loss.backward) - with _test_eager_guard(): - func_isinstance() func_isinstance() def test_hook_change_place(self): @@ -1420,8 +1378,6 @@ class TestAmpWithHook(unittest.TestCase): loss = a.sum() self.assertRaises(RuntimeError, loss.backward) - with _test_eager_guard(): - func_isinstance() func_isinstance() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py new file mode 100644 index 0000000000000000000000000000000000000000..d12b002f04ef874a2c0f7ec537292736c741c9dc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py @@ -0,0 +1,1374 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +os.environ['FLAGS_enable_eager_mode'] = '1' + +import unittest +import paddle +import paddle.fluid as fluid +import numpy as np +import six +import cv2 +import tempfile +from test_imperative_resnet import ResNet, BottleneckBlock, ConvBNLayer, train_parameters, optimizer_setting +import paddle.nn as nn +from paddle.static import InputSpec +from paddle.autograd import PyLayer + +if fluid.core.is_compiled_with_cuda(): + fluid.set_flags({"FLAGS_cudnn_deterministic": True}) + + +class SimpleConv(fluid.dygraph.Layer): + + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + super(SimpleConv, self).__init__() + self._conv = fluid.dygraph.Conv2D(num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=None, + use_cudnn=True) + + def forward(self, inputs): + return self._conv(inputs) + + +class TestAutoCast(unittest.TestCase): + + def amp_guard_white_op(self): + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + conv2d = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(True): + out_fp16 = conv2d(data) + + with fluid.dygraph.amp_guard(False): + out_fp32 = conv2d(data) + + self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32) + self.assertTrue(out_fp16.dtype == fluid.core.VarDesc.VarType.FP16) + self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32) + + def test_amp_guard_white_op(self): + self.amp_guard_white_op() + + def amp_guard_black_op(self): + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(True): + out_fp32 = fluid.layers.mean(data) + + self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32) + self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32) + + def test_amp_guard_black_op(self): + self.amp_guard_black_op() + + def custom_op_list(self): + with fluid.dygraph.guard(): + tracer = fluid.framework._dygraph_tracer() + base_white_list = fluid.dygraph.amp.auto_cast.WHITE_LIST + base_black_list = fluid.dygraph.amp.auto_cast.BLACK_LIST + with fluid.dygraph.amp_guard(custom_white_list=["log"], + custom_black_list=["conv2d"]): + white_list, black_list = tracer._get_amp_op_list() + self.assertTrue( + set(white_list) == (set(base_white_list) | {"log"}) - + {"conv2d"}) + + self.assertTrue( + set(black_list) == (set(base_black_list) - {"log"}) + | {"conv2d"}) + + base_white_list = fluid.dygraph.amp.auto_cast.PURE_FP16_WHITE_LIST + base_black_list = fluid.dygraph.amp.auto_cast.PURE_FP16_BLACK_LIST + with fluid.dygraph.amp_guard(custom_white_list=["log"], + custom_black_list=["conv2d"], + level='O2'): + white_list, black_list = tracer._get_amp_op_list() + self.assertTrue( + set(white_list) == (set(base_white_list) | {"log"}) - + {"conv2d"}) + + self.assertTrue( + set(black_list) == (set(base_black_list) - {"log"}) + | {"conv2d"}) + + def test_custom_op_list(self): + self.custom_op_list() + + def custom_op_list_exception(self): + inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32) + + def func(): + with fluid.dygraph.guard(): + model = SimpleConv(num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + with fluid.dygraph.amp_guard(custom_white_list=["conv2d"], + custom_black_list=["conv2d"]): + inp = 
fluid.dygraph.to_variable(inp_np) + out = model(inp) + + self.assertRaises(ValueError, func) + + def test_custom_op_list_exception(self): + self.custom_op_list_exception() + + def amp_guard_upsupported_fp16_op(self): + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + conv2d = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(True): + out_amp_fp16 = conv2d(data) + out_amp_fp32 = paddle.expand_as( + out_amp_fp16, + out_amp_fp16) # expand_as_v2 has no fp16 kernel + + with fluid.dygraph.amp_guard(True, level='O2'): + out_purefp16_fp16 = conv2d(data) + out_purefp16_fp32 = paddle.expand_as( + out_purefp16_fp16, + out_purefp16_fp16) # expand_as_v2 has no fp16 kernel + self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32) + self.assertTrue(out_amp_fp16.dtype == fluid.core.VarDesc.VarType.FP16) + self.assertTrue(out_amp_fp32.dtype == fluid.core.VarDesc.VarType.FP32) + self.assertTrue( + out_purefp16_fp16.dtype == fluid.core.VarDesc.VarType.FP16) + self.assertTrue( + out_purefp16_fp32.dtype == fluid.core.VarDesc.VarType.FP32) + + def test_amp_guard_upsupported_fp16_op(self): + self.amp_guard_upsupported_fp16_op() + + def mode_exception(self): + + def func(): + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + conv2d = fluid.dygraph.Conv2D(3, + 2, + 3, + bias_attr=False, + act=None) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(level='O'): + out = conv2d(data) + + self.assertRaises(ValueError, func) + + def test_mode_exception(self): + self.mode_exception() + + +class TestAmpScaler(unittest.TestCase): + + def scale(self): + with fluid.dygraph.guard(): + data = paddle.rand([10, 1024]) + scaler = paddle.fluid.dygraph.AmpScaler(init_loss_scaling=1024) + scaled_data = scaler.scale(data) + self.assertEqual( + np.array_equal(scaled_data.numpy(), + data.numpy() * 1024), True) + + def test_scale(self): + self.scale() + + def minimize(self): + inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32) + + def run_simple_conv(inp_np, use_scaler=True): + paddle.seed(10) + paddle.framework.random._manual_program_seed(10) + with fluid.dygraph.guard(): + model = SimpleConv(num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=model.parameters()) + scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(inp_np) + + out = model(data) + loss = fluid.layers.mean(out) + if use_scaler: + print('use scaler') + scaled_loss = scaler.scale(loss) + scaled_loss.backward() + optimize_ops, params_grads = scaler.minimize( + optimizer, scaled_loss) + else: + print('use no scaler') + loss.backward() + optimize_ops, params_grads = optimizer.minimize(loss) + return optimize_ops, params_grads + + outs_with_scaler = run_simple_conv(inp_np, use_scaler=True) + outs_no_scaler = run_simple_conv(inp_np, use_scaler=False) + + self.assertEqual(outs_with_scaler[0], + []) # optimize_ops is [] in dygraph mode + self.assertEqual(outs_no_scaler[0], + []) # optimize_ops is [] in dygraph mode + for i in range(len(outs_with_scaler[1])): + # check each grad + self.assertEqual( + np.allclose(outs_with_scaler[1][i][1].numpy(), + outs_no_scaler[1][i][1].numpy()), True) + # check each parameter + self.assertEqual( + np.allclose(outs_with_scaler[1][i][0].numpy(), + outs_no_scaler[1][i][0].numpy()), 
True) + + def test_minimize(self): + self.minimize() + + def step(self): + inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32) + + def run_simple_conv(inp_np, use_scaler=True): + paddle.seed(10) + paddle.framework.random._manual_program_seed(10) + with fluid.dygraph.guard(): + model = SimpleConv(num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(inp_np) + + out = model(data) + loss = fluid.layers.mean(out) + if use_scaler: + print('use scaler') + scaled_loss = scaler.scale(loss) + scaled_loss.backward() + scaler.step(optimizer) + scaler.update() + else: + print('use no scaler') + loss.backward() + optimizer.step() + return optimizer._parameter_list + + outs_with_scaler = run_simple_conv(inp_np, use_scaler=True) + outs_no_scaler = run_simple_conv(inp_np, use_scaler=False) + + for i in range(len(outs_with_scaler)): + # check each parameter + self.assertEqual( + np.allclose(outs_with_scaler[i].numpy(), + outs_no_scaler[i].numpy()), True) + + def test_step(self): + self.step() + + def nan_inf(self): + inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32) + inp_np[0][1][2][3] = np.nan + with fluid.dygraph.guard(): + model = SimpleConv(num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + params_init = {} + for param in model.parameters(): + params_init[param.name] = param.numpy() + optimizer = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=model.parameters()) + scaler = fluid.dygraph.AmpScaler(init_loss_scaling=1024) + data = fluid.dygraph.to_variable(inp_np) + + out = model(data) + loss = fluid.layers.mean(out) + scaled_loss = scaler.scale(loss) + scaled_loss.backward() + optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss) + self.assertEqual(scaler._found_inf.numpy() == 1, True) + + for param in model.parameters(): + # param not update when tensor contains nan or inf + self.assertTrue( + np.array_equal(param.numpy(), params_init[param.name])) + + def test_nan_inf(self): + self.nan_inf() + + def step_update_exception(self): + + def func1(): + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.unscale_(optimizer) + scaler.unscale_(optimizer) + + self.assertRaises(RuntimeError, func1) + + def func2(): + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + scaler.unscale_(optimizer) + + self.assertRaises(RuntimeError, func2) + + def func3(): + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + scaler.step(optimizer) + + 
self.assertRaises(RuntimeError, func3) + + def test_step_update_exception(self): + self.step_update_exception() + + def test_get_and_set(self): + with fluid.dygraph.guard(): + scaler = paddle.amp.GradScaler(enable=True, + init_loss_scaling=1024, + incr_ratio=2.0, + decr_ratio=0.5, + incr_every_n_steps=1000, + decr_every_n_nan_or_inf=2, + use_dynamic_loss_scaling=True) + self.assertEqual(scaler.is_enable() == True, True) + self.assertEqual(scaler.get_init_loss_scaling() == 1024, True) + self.assertEqual(scaler.get_incr_ratio() == 2.0, True) + self.assertEqual(scaler.get_decr_ratio() == 0.5, True) + self.assertEqual(scaler.get_incr_every_n_steps() == 1000, True) + self.assertEqual(scaler.get_decr_every_n_nan_or_inf() == 2, True) + self.assertEqual(scaler.is_use_dynamic_loss_scaling() == True, True) + scaler.set_decr_every_n_nan_or_inf(4) + self.assertEqual(scaler.get_decr_every_n_nan_or_inf() == 4, True) + scaler.set_decr_ratio(0.1) + self.assertEqual(scaler.get_decr_ratio() == 0.1, True) + scaler.set_incr_every_n_steps(200) + self.assertEqual(scaler.get_incr_every_n_steps() == 200, True) + scaler.set_incr_ratio(3.0) + self.assertEqual(scaler.get_incr_ratio() == 3.0, True) + scaler.set_init_loss_scaling(100) + self.assertEqual(scaler.get_init_loss_scaling() == 100, True) + + def test_state_dict_and_load_state_dict(self): + with fluid.dygraph.guard(): + scaler1 = paddle.amp.GradScaler(enable=True, + init_loss_scaling=14, + incr_ratio=233.0, + decr_ratio=0.523, + incr_every_n_steps=1090, + decr_every_n_nan_or_inf=20, + use_dynamic_loss_scaling=True) + scaler_state = scaler1.state_dict() + scaler2 = paddle.amp.GradScaler(enable=True) + scaler2.load_state_dict(scaler_state) + self.assertEqual(scaler2.get_init_loss_scaling() == 14, True) + self.assertEqual(scaler2.get_incr_ratio() == 233.0, True) + self.assertEqual(scaler2.get_decr_ratio() == 0.523, True) + self.assertEqual(scaler2.get_incr_every_n_steps() == 1090, True) + self.assertEqual(scaler2.get_decr_every_n_nan_or_inf() == 20, True) + + scaler3 = paddle.amp.GradScaler(enable=False) + scaler3.load_state_dict(scaler_state) + self.assertEqual(scaler3.is_enable() == False, True) + + def test_state_dict_and_load_state_dict_error(self): + + def test_error(): + state_empty = {} + scaler = paddle.amp.GradScaler(enable=True) + scaler.load_state_dict(state_empty) + + self.assertRaises(RuntimeError, test_error) + + +def reader_decorator(reader): + + def __reader__(): + for item in reader(): + img = np.array(item[0]).astype('float32').reshape(3, 224, 224) + label = np.array(item[1]).astype('int64').reshape(1) + yield img, label + + return __reader__ + + +class TestGradScalerStateDict(unittest.TestCase): + + def train_resnet(self, + enable_amp=True, + use_data_loader=True, + use_save_load=True): + seed = 90 + + batch_size = train_parameters["batch_size"] + batch_num = 4 + + paddle.seed(seed) + paddle.framework.random._manual_program_seed(seed) + + resnet = ResNet(use_cudnn=True) + optimizer = optimizer_setting(train_parameters, + parameter_list=resnet.parameters()) + np.random.seed(seed) + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size) + + dy_param_init_value = {} + for param in resnet.parameters(): + dy_param_init_value[param.name] = param.numpy() + + program = None + scaler = paddle.amp.GradScaler(enable=enable_amp, + init_loss_scaling=2.**10) + + if use_data_loader: + train_reader = paddle.batch(reader_decorator( + paddle.dataset.flowers.train(use_xmap=False)), + batch_size=batch_size, + 
drop_last=True) + train_loader = fluid.io.DataLoader.from_generator( + capacity=4, + use_double_buffer=True, + iterable=True, + return_list=True) + train_loader.set_sample_list_generator(train_reader) + train_reader = train_loader + + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + if use_data_loader: + img, label = data + else: + dy_x_data = np.array([x[0].reshape(3, 224, 224) + for x in data]).astype('float32') + if len(np.array([x[1] + for x in data]).astype('int64')) != batch_size: + continue + y_data = np.array([x[1] for x in data + ]).astype('int64').reshape(-1, 1) + + img = paddle.to_tensor(dy_x_data) + label = paddle.to_tensor(y_data) + label.stop_gradient = True + + with paddle.amp.auto_cast(enable=enable_amp): + out = resnet(img) + + loss = paddle.nn.functional.cross_entropy(input=out, label=label) + avg_loss = paddle.mean(x=loss) + + dy_out = avg_loss.numpy() + + scaled_loss = scaler.scale(avg_loss) + scaled_loss.backward() + + scaler.minimize(optimizer, scaled_loss) + + dy_grad_value = {} + for param in resnet.parameters(): + if param.trainable: + np_array = np.array(param._grad_ivar().value().get_tensor()) + dy_grad_value[param.name + + fluid.core.grad_var_suffix()] = np_array + + resnet.clear_gradients() + + dy_param_value = {} + for param in resnet.parameters(): + dy_param_value[param.name] = param.numpy() + + if use_save_load and batch_id == 2: + paddle.save(scaler.state_dict(), 'ResNet_model.pdparams') + dict_load = paddle.load('ResNet_model.pdparams') + scaler.load_state_dict(dict_load) + if use_data_loader: + train_reader._reset() + return dy_out, dy_param_value, dy_grad_value + + def test_with_state_dict(self): + + def func_isinstance(): + with fluid.dygraph.guard(): + out_use_state_dict = self.train_resnet(enable_amp=True, + use_data_loader=True, + use_save_load=True) + out_no_state_dict = self.train_resnet(enable_amp=True, + use_data_loader=True, + use_save_load=False) + print('save_load:', out_use_state_dict[0], out_no_state_dict[0]) + self.assertTrue( + np.allclose(out_use_state_dict[0], out_no_state_dict[0])) + + func_isinstance() + + +class TestAmpDecorator(unittest.TestCase): + + def test_mode_exception(self): + + def func(): + with fluid.dygraph.guard(): + model = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + opt = paddle.optimizer.SGD(parameters=model.parameters()) + model, opt = paddle.amp.decorate(models=model, + optimizers=opt, + level='O') + + self.assertRaises(ValueError, func) + + def test_input_type_exception(self): + + def test_error_model(): + + class MyModel(object): + + def __init__(self): + print("A fake Model") + + model = MyModel() + with fluid.dygraph.guard(): + paddle.amp.decorate(models=model, optimizers=None, level='O2') + + self.assertRaises(TypeError, test_error_model) + + def test_error_distributed_model(): + model = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + model = paddle.DataParallel(model) + with fluid.dygraph.guard(): + model = paddle.amp.decorate(models=model, level='O2') + + self.assertRaises(RuntimeError, test_error_distributed_model) + + def test_error_optimizer(): + + class MyOptimizer(object): + + def __init__(self): + print("A fake Optimizer") + + model = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + opt = MyOptimizer() + with fluid.dygraph.guard(): + paddle.amp.decorate(models=model, optimizers=opt, level='O2') + + self.assertRaises(TypeError, test_error_optimizer) + + def test_set_master_weight(self): + model1 = fluid.dygraph.Conv2D(3, 2, 3, 
bias_attr=False, act=None) + opt1 = paddle.optimizer.Adam(learning_rate=0.0001, + parameters=model1.parameters(), + multi_precision=True) + + model2 = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + opt2 = paddle.optimizer.Adam(learning_rate=0.0001, + parameters=model2.parameters(), + multi_precision=False) + + model1, opt1 = paddle.amp.decorate(models=model1, + optimizers=opt1, + level='O2', + master_weight=None) + self.assertEqual(opt1._multi_precision, True) + + models, opt2 = paddle.amp.decorate(models=[model1, model2], + optimizers=opt2, + level='O2', + master_weight=None) + self.assertEqual(opt2._multi_precision, True) + + model3 = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + opt3 = paddle.optimizer.Adam(learning_rate=0.0001, + parameters=model3.parameters()) + + model4 = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + opt4 = paddle.optimizer.Adam(learning_rate=0.0001, + parameters=model4.parameters()) + + model3, opts = paddle.amp.decorate(models=model3, + optimizers=[opt3, opt4], + level='O2', + master_weight=True) + self.assertEqual(opts[0]._multi_precision, True) + self.assertEqual(opts[1]._multi_precision, True) + + models = [model3, model4] + optimizers = [opt3, opt4] + models, optimizers = paddle.amp.decorate(models=models, + optimizers=optimizers, + level='O2', + master_weight=False) + self.assertEqual(optimizers[0]._multi_precision, False) + self.assertEqual(optimizers[1]._multi_precision, False) + + def test_skip_BatchNorm_Layer_norm(self): + model = paddle.nn.LayerNorm(1) + model = paddle.amp.decorate(models=model, level='O2') + for param in model.parameters(): + self.assertEqual((param.dtype == paddle.float32), True) + + model = paddle.nn.BatchNorm(1) + model = paddle.amp.decorate(models=model, level='O2') + for param in model.parameters(): + self.assertEqual((param.dtype == paddle.float32), True) + + model = paddle.nn.BatchNorm1D(1) + model = paddle.amp.decorate(models=model, level='O2') + for param in model.parameters(): + self.assertEqual((param.dtype == paddle.float32), True) + + model = paddle.nn.BatchNorm2D(1) + model = paddle.amp.decorate(models=model, level='O2') + for param in model.parameters(): + self.assertEqual((param.dtype == paddle.float32), True) + + model = paddle.nn.BatchNorm3D(1) + model = paddle.amp.decorate(models=model, level='O2') + for param in model.parameters(): + self.assertEqual((param.dtype == paddle.float32), True) + + +class TestStateDictHookForAMP(unittest.TestCase): + + def test_state_dict_hook(self): + + def func_isinstance(): + paddle.seed(100) + model = paddle.nn.Linear(2, 4) + model = paddle.amp.decorate(models=model, + level='O2', + save_dtype='float32') + param_value_ori = {} + for param in model.parameters(): + param_value_ori[param.name] = param.numpy() + + state_dict = model.state_dict() + for key, value in state_dict.items(): + state_dict[key] = value.cast("float16") + model.set_state_dict(state_dict) + + param_value_now = {} + for param in model.parameters(): + param_value_now[param.name] = param.numpy() + + for key in param_value_ori.keys(): + print(np.equal(param_value_ori[key], param_value_now[key])) + + func_isinstance() + + +class TestPureFp16SaveLoad(unittest.TestCase): + + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + + def test_save_dtype_exception(self): + + def func(): + paddle.disable_static() + model = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + opt = 
paddle.optimizer.SGD(parameters=model.parameters()) + paddle.amp.decorate(models=model, + optimizers=opt, + level='O2', + save_dtype='int') + + self.assertRaises(ValueError, func) + + def train_resnet(self, + enable_amp=True, + use_data_loader=True, + use_save_load=True): + seed = 90 + + batch_size = train_parameters["batch_size"] + batch_num = 4 + + paddle.seed(seed) + paddle.framework.random._manual_program_seed(seed) + + resnet = ResNet(use_cudnn=True) + optimizer = optimizer_setting(train_parameters, + parameter_list=resnet.parameters()) + np.random.seed(seed) + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size) + + dy_param_init_value = {} + for param in resnet.parameters(): + dy_param_init_value[param.name] = param.numpy() + + program = None + scaler = paddle.amp.GradScaler(enable=enable_amp, + init_loss_scaling=2.**10) + + if use_data_loader: + train_reader = paddle.batch(reader_decorator( + paddle.dataset.flowers.train(use_xmap=False)), + batch_size=batch_size, + drop_last=True) + train_loader = fluid.io.DataLoader.from_generator( + capacity=4, + use_double_buffer=True, + iterable=True, + return_list=True) + train_loader.set_sample_list_generator(train_reader) + train_reader = train_loader + + if enable_amp: + resnet, optimizer = paddle.amp.decorate(models=resnet, + optimizers=optimizer, + level='O2', + save_dtype='float32') + + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + if use_data_loader: + img, label = data + else: + dy_x_data = np.array([x[0].reshape(3, 224, 224) + for x in data]).astype('float32') + if len(np.array([x[1] + for x in data]).astype('int64')) != batch_size: + continue + y_data = np.array([x[1] for x in data + ]).astype('int64').reshape(-1, 1) + + img = paddle.to_tensor(dy_x_data) + label = paddle.to_tensor(y_data) + label.stop_gradient = True + + with paddle.amp.auto_cast(enable=enable_amp, level='O2'): + out = resnet(img) + + loss = paddle.nn.functional.cross_entropy(input=out, label=label) + loss = paddle.cast(loss, 'float32') + avg_loss = paddle.mean(x=loss) + + dy_out = avg_loss.numpy() + + scaled_loss = scaler.scale(avg_loss) + scaled_loss.backward() + + scaler.minimize(optimizer, scaled_loss) + + dy_grad_value = {} + for param in resnet.parameters(): + if param.trainable: + np_array = np.array(param._grad_ivar().value().get_tensor()) + dy_grad_value[param.name + + fluid.core.grad_var_suffix()] = np_array + + resnet.clear_gradients() + + dy_param_value = {} + for param in resnet.parameters(): + dy_param_value[param.name] = param.numpy() + + if use_save_load and batch_id == 2: + # paddle.save + obj = { + 'model': resnet.state_dict(), + 'opt': optimizer.state_dict(), + 'scaler': scaler.state_dict() + } + path = os.path.join(self.temp_dir.name, 'model.pdparams') + paddle.save(obj, path) + # paddle.load + obj_load = paddle.load(path) + resnet = ResNet(use_cudnn=True) + optimizer = optimizer_setting( + train_parameters, parameter_list=resnet.parameters()) + resnet.set_state_dict(obj_load['model']) + optimizer.set_state_dict(obj_load['opt']) + scaler.load_state_dict(obj_load['scaler']) + resnet, optimizer = paddle.amp.decorate(models=resnet, + optimizers=optimizer, + level='O2', + save_dtype='float32') + + if use_data_loader: + train_reader._reset() + return dy_out, dy_param_value, dy_grad_value + + def test_with_save_load(self): + + def func_isinstance(): + with fluid.dygraph.guard(): + out_use_save_load = self.train_resnet(enable_amp=True, + use_data_loader=True, + 
use_save_load=True) + out_no_save_load = self.train_resnet(enable_amp=True, + use_data_loader=True, + use_save_load=False) + print('save_load:', out_use_save_load[0], out_no_save_load[0]) + self.assertTrue( + np.allclose(out_use_save_load[0], out_no_save_load[0])) + + func_isinstance() + + +class TestPureFp16InferenceSaveLoad(unittest.TestCase): + + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + + def inference_save_load(self): + BATCH_SIZE = 16 + BATCH_NUM = 4 + EPOCH_NUM = 4 + IMAGE_SIZE = 784 + CLASS_NUM = 10 + + # define a random dataset + class RandomDataset(paddle.io.Dataset): + + def __init__(self, num_samples): + self.num_samples = num_samples + + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, + (1, )).astype('int64') + return image, label + + def __len__(self): + return self.num_samples + + class LinearNet(nn.Layer): + + def __init__(self): + super(LinearNet, self).__init__() + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) + + def forward(self, x): + return self._linear(x) + + def train(layer, loader, loss_fn, opt): + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + with paddle.amp.auto_cast(enable=True, + custom_white_list=None, + custom_black_list=None, + level='O2'): + out = layer(image) + loss = loss_fn(out, label) + loss.backward() + opt.step() + opt.clear_grad() + + # train + layer = LinearNet() + adam = paddle.optimizer.Adam(learning_rate=0.001, + parameters=layer.parameters(), + multi_precision=True) + loss_fn = nn.CrossEntropyLoss() + layer, adam = paddle.amp.decorate(models=layer, + optimizers=adam, + save_dtype='float32') + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + loader = paddle.io.DataLoader(dataset, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) + + train(layer, loader, loss_fn, adam) + + # save + path = os.path.join(self.temp_dir.name, 'example_model/linear') + paddle.jit.save(layer, + path, + input_spec=[InputSpec(shape=[IMAGE_SIZE], name='x')]) + + # jit.load + loaded_layer = paddle.jit.load(path) + + # inference + loaded_layer.eval() + x = np.random.randn(1, IMAGE_SIZE).astype('float32') + x_tensor = paddle.to_tensor(x) + pred = loaded_layer(x_tensor) + + # load_inference_model + paddle.enable_static() + exe = paddle.static.Executor() + [inference_program, feed_target_names, + fetch_targets] = (paddle.static.load_inference_model(path, exe)) + tensor_img = x + results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + print("pred.numpy()", pred.numpy()) + print("result", results[0]) + self.assertTrue(np.array_equal(pred.numpy(), results[0])) + paddle.disable_static() + + def test_inference_save_load(self): + self.inference_save_load() + + +class TestResnet2(unittest.TestCase): + """ + Use paddle-2.0 API + """ + + def train_resnet(self, + enable_amp=True, + level='O1', + use_data_loader=False, + use_param_group=False): + seed = 90 + + batch_size = train_parameters["batch_size"] + batch_num = 10 + + paddle.seed(seed) + paddle.framework.random._manual_program_seed(seed) + + resnet = ResNet(use_cudnn=True) + + if use_param_group: + conv_params = resnet.conv.parameters() + other_params = [] + for p in resnet.parameters(): + contains = False + for q in conv_params: + if p is q: + contains = True + if not contains: + other_params.append(p) + # NOTE(zhiqiu): The Membership test 
operations(in / not in) calls "is" and "equal", + # see details: https://docs.python.org/3/reference/expressions.html#membership-test-operations. + # So do not use other_params = [p for p in resnet.parameters() if p not in conv_params] + optimizer = paddle.optimizer.Momentum(parameters=[{ + 'params': + conv_params, + 'learning_rate': + 0.01 + }, { + 'params': + other_params, + 'learning_rate': + 0.001 + }], + multi_precision=True) + else: + optimizer = paddle.optimizer.SGD(parameters=resnet.parameters()) + + np.random.seed(seed) + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size) + + dy_param_init_value = {} + for param in resnet.parameters(): + dy_param_init_value[param.name] = param.numpy() + + program = None + scaler = paddle.amp.GradScaler(enable=enable_amp, + init_loss_scaling=2.**10) + + if use_data_loader: + train_reader = paddle.batch(reader_decorator( + paddle.dataset.flowers.train(use_xmap=False)), + batch_size=batch_size, + drop_last=True) + train_loader = fluid.io.DataLoader.from_generator( + capacity=4, + use_double_buffer=True, + iterable=True, + return_list=True) + train_loader.set_sample_list_generator(train_reader) + train_reader = train_loader + + if enable_amp and (level == 'O2'): + resnet = paddle.amp.decorate(models=resnet, level='O2') + + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + if use_data_loader: + img, label = data + else: + dy_x_data = np.array([x[0].reshape(3, 224, 224) + for x in data]).astype('float32') + if len(np.array([x[1] + for x in data]).astype('int64')) != batch_size: + continue + y_data = np.array([x[1] for x in data + ]).astype('int64').reshape(-1, 1) + + img = paddle.to_tensor(dy_x_data) + label = paddle.to_tensor(y_data) + label.stop_gradient = True + + with paddle.amp.auto_cast(enable=enable_amp, level=level): + out = resnet(img) + + loss = paddle.nn.functional.cross_entropy(input=out, label=label) + loss = paddle.cast(loss, 'float32') + avg_loss = paddle.mean(x=loss) + + dy_out = avg_loss.numpy() + + scaled_loss = scaler.scale(avg_loss) + scaled_loss.backward() + scaler.unscale_(optimizer) + scaler.step(optimizer) + scaler.update() + + dy_grad_value = {} + for param in resnet.parameters(): + if param.trainable: + np_array = np.array(param._grad_ivar().value().get_tensor()) + dy_grad_value[param.name + + fluid.core.grad_var_suffix()] = np_array + + resnet.clear_gradients() + + dy_param_value = {} + for param in resnet.parameters(): + dy_param_value[param.name] = param.numpy() + if use_data_loader: + train_reader._reset() + return dy_out, dy_param_value, dy_grad_value + + def test_resnet(self): + + def func_isinstance(): + with fluid.dygraph.guard(): + out_fp32 = self.train_resnet(enable_amp=False) + out_amp = self.train_resnet(enable_amp=True) + out_pure_fp16 = self.train_resnet(enable_amp=True, level='O2') + print(out_fp32[0], out_amp[0], out_pure_fp16[0]) + self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5)) + self.assertTrue( + np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2)) + + func_isinstance() + + def test_with_data_loader(self): + + def func_isinstance(): + with fluid.dygraph.guard(): + out_fp32 = self.train_resnet(enable_amp=False, + use_data_loader=True) + out_amp = self.train_resnet(enable_amp=True, + use_data_loader=True) + out_pure_fp16 = self.train_resnet(enable_amp=True, + use_data_loader=True, + level='O2') + print(out_fp32[0], out_amp[0], out_pure_fp16[0]) + self.assertTrue(np.allclose(out_fp32[0], out_amp[0], 
atol=1.e-5)) + self.assertTrue( + np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2)) + + func_isinstance() + + def test_param_group(self): + + def func_isinstance(): + with fluid.dygraph.guard(): + out_fp32 = self.train_resnet(enable_amp=False, + use_data_loader=True, + use_param_group=True) + out_amp = self.train_resnet(enable_amp=True, + use_data_loader=True, + use_param_group=True) + out_pure_fp16 = self.train_resnet(enable_amp=True, + use_data_loader=True, + use_param_group=True, + level='O2') + print(out_fp32[0], out_amp[0], out_pure_fp16[0]) + self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-5)) + self.assertTrue( + np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-2)) + + func_isinstance() + + +class TestResnet(unittest.TestCase): + """ + Use paddle-1.x API + """ + + def train_resnet(self, enable_amp=True, level='O1'): + seed = 90 + + batch_size = train_parameters["batch_size"] + batch_num = 1 + + with fluid.dygraph.guard(): + paddle.seed(seed) + paddle.framework.random._manual_program_seed(seed) + + resnet = ResNet(use_cudnn=True) + optimizer = optimizer_setting(train_parameters, + parameter_list=resnet.parameters()) + optimizer = paddle.optimizer.Momentum( + parameters=resnet.parameters(), multi_precision=True) + np.random.seed(seed) + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), + batch_size=batch_size) + + dy_param_init_value = {} + for param in resnet.parameters(): + dy_param_init_value[param.name] = param.numpy() + + program = None + scaler = paddle.fluid.dygraph.AmpScaler(enable=enable_amp, + init_loss_scaling=2.**10) + + if enable_amp and (level == 'O2'): + resnet, optimizer = paddle.fluid.dygraph.amp_decorate( + models=resnet, optimizers=optimizer, level='O2') + + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + dy_x_data = np.array([x[0].reshape(3, 224, 224) + for x in data]).astype('float32') + if len(np.array([x[1] + for x in data]).astype('int64')) != batch_size: + continue + y_data = np.array([x[1] for x in data + ]).astype('int64').reshape(-1, 1) + img = fluid.dygraph.to_variable(dy_x_data) + label = fluid.dygraph.to_variable(y_data) + label.stop_gradient = True + with paddle.fluid.dygraph.amp_guard(enable=enable_amp, + level=level): + out = resnet(img) + + loss = fluid.layers.cross_entropy(input=out, label=label) + avg_loss = fluid.layers.mean(x=loss) + + dy_out = avg_loss.numpy() + + scaled_loss = scaler.scale(avg_loss) + scaled_loss.backward() + + scaler.minimize(optimizer, scaled_loss) + + dy_grad_value = {} + for param in resnet.parameters(): + if param.trainable: + np_array = np.array( + param._grad_ivar().value().get_tensor()) + dy_grad_value[param.name + + fluid.core.grad_var_suffix()] = np_array + + resnet.clear_gradients() + + dy_param_value = {} + for param in resnet.parameters(): + dy_param_value[param.name] = param.numpy() + + return dy_out, dy_param_value, dy_grad_value + + def test_resnet(self): + + def func_isinstance(): + out_fp32 = self.train_resnet(enable_amp=False) + out_amp = self.train_resnet(enable_amp=True) + out_pure_fp16 = self.train_resnet(enable_amp=True, level='O2') + print(out_fp32[0], out_amp[0], out_pure_fp16[0]) + self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2)) + self.assertTrue( + np.allclose(out_fp32[0], out_pure_fp16[0], atol=1.e-1)) + + func_isinstance() + + +class TestLayerNormFp16(unittest.TestCase): + r''' layer_norm and batch_norm support mixed inputs, i.e., only input x is fp16 + and other params are fp32. 
+ ''' + + def test_layer_norm_fp16(self): + + def func_isinstance(): + if fluid.is_compiled_with_cuda(): + with fluid.dygraph.guard(fluid.CUDAPlace(0)): + x = paddle.rand([2, 2, 2, 3]) + layer_norm = paddle.nn.LayerNorm(x.shape[1:]) + with paddle.amp.auto_cast(custom_white_list=['layer_norm']): + out = layer_norm(x) + + self.assertTrue( + out.dtype == fluid.core.VarDesc.VarType.FP16) + + func_isinstance() + + +class TestBf16(unittest.TestCase): + ''' + test amp for BF16 + ''' + + def train(self, enable_amp=True, amp_level='O1'): + paddle.seed(100) + input = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) + conv = paddle.nn.Conv2D(4, 6, (3, 3)) + with paddle.amp.auto_cast(enable=enable_amp, + level=amp_level, + dtype='bfloat16'): + output = conv(input) + output = output.cast('float32') + return output.numpy() + + def test_bf16(self): + + def func_isinstance(): + if fluid.core.is_compiled_with_cuda( + ) and fluid.core.is_bfloat16_supported(paddle.CUDAPlace(0)): + out_fp32 = self.train(enable_amp=False) + out_bf16_O1 = self.train(enable_amp=True, amp_level='O1') + out_bf16_O2 = self.train(enable_amp=True, amp_level='O2') + self.assertTrue( + np.allclose(out_fp32, out_bf16_O1, rtol=1.e-3, atol=1.e-1)) + self.assertTrue( + np.allclose(out_fp32, out_bf16_O2, rtol=1.e-3, atol=1.e-1)) + + func_isinstance() + + +class TestAmpWithPyLyer(unittest.TestCase): + + def test_pylayer(self): + + class MyMM(PyLayer): + + @staticmethod + def forward(ctx, a, b): + ctx.save_for_backward(a, b) + return a.mm(b) + + @staticmethod + def backward(ctx, grad): + a, b = ctx.saved_tensor() + # NOTE(zhiqiu): a and b is float32 now, while grad is fp16 when forward runs with auto_cast() + # thus, the mm operation raise errors because of the dtype of inputs are inconsistent before. 
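# The NOTE above motivates a defensive pattern (an editor's illustration, not
# something this patch adds): when a PyLayer may run under
# paddle.amp.auto_cast(), backward() can cast the incoming gradient back to
# the dtype of the tensors saved in forward() before mixing them.  The class
# name below is hypothetical.
import paddle
from paddle.autograd import PyLayer

class CastSafeMM(PyLayer):

    @staticmethod
    def forward(ctx, a, b):
        ctx.save_for_backward(a, b)
        return a.mm(b)

    @staticmethod
    def backward(ctx, grad):
        a, b = ctx.saved_tensor()
        if grad.dtype != a.dtype:
            # under auto_cast the gradient may arrive as float16 while the
            # saved activations are float32; align dtypes before the matmuls
            grad = paddle.cast(grad, a.dtype)
        return grad.mm(b.t()), a.t().mm(grad)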
+ return grad.mm(b.t()), a.t().mm(grad) + + x = paddle.rand([10, 10]) + y = paddle.rand([10, 10]) + x.stop_gradient = False + y.stop_gradient = False + + # with paddle.amp.auto_cast(): + res = MyMM.apply(x, y) + loss = paddle.mean(res) + loss.backward() + + +class TestAmpWithHook(unittest.TestCase): + + def test_hook_change_dtype(self): + + def func_isinstance(): + with paddle.fluid.dygraph.guard(): + v = paddle.rand([3, 3]) + v.stop_gradient = False + + def foo(grad): + print('grad', grad, grad.dtype) # grad's dtype is float32 + res = paddle.mm(grad, grad) # mm runs in fp16 + print('res', res, res.dtype) # res's dtype is float16 + return res + + v.register_hook(foo) + with paddle.amp.auto_cast(): + a = paddle.mm(v, v) + loss = a.sum() + self.assertRaises(RuntimeError, loss.backward) + + func_isinstance() + + def test_hook_change_place(self): + + def func_isinstance(): + with paddle.fluid.dygraph.guard(): + v = paddle.rand([3, 3]) + v.stop_gradient = False + + def foo(grad): + res = grad.cpu() # change place + return res + + v.register_hook(foo) + with paddle.amp.auto_cast(): + a = paddle.mm(v, v) + loss = a.sum() + self.assertRaises(RuntimeError, loss.backward) + + func_isinstance() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pylayer_op.py b/python/paddle/fluid/tests/unittests/test_pylayer_op.py index d55e427f286c37dce63c7b780b68139f76ae46f2..f7f5e81b841ed5d736455eb34155ba70fac63273 100644 --- a/python/paddle/fluid/tests/unittests/test_pylayer_op.py +++ b/python/paddle/fluid/tests/unittests/test_pylayer_op.py @@ -18,7 +18,7 @@ import unittest import numpy as np import paddle -from paddle.autograd import PyLayer, EagerPyLayer +from paddle.autograd.py_layer import LegacyPyLayer, EagerPyLayer from paddle.fluid.framework import _test_eager_guard, in_dygraph_mode @@ -32,7 +32,7 @@ class TestPyLayer(unittest.TestCase): def func_test_simple_pylayer_multiple_output(self): - class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): + class tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward(ctx, x1, x2, func1, func2=paddle.square): @@ -70,7 +70,7 @@ class TestPyLayer(unittest.TestCase): def func_test_simple_pylayer_return_none_with_no_grad(self): - class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): + class tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward(ctx, x1, x2, func1, func2=paddle.square): @@ -112,7 +112,7 @@ class TestPyLayer(unittest.TestCase): def func_test_simple_pylayer_single_output(self): - class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): + class tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward(ctx, x1, func1, func2=paddle.square): @@ -146,7 +146,7 @@ class TestPyLayer(unittest.TestCase): def func_test_pylayer_num_output_match(self): - class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): + class tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward( @@ -175,7 +175,7 @@ class TestPyLayer(unittest.TestCase): def func_test_pylayer_dtype(self): - class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): + class tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward(ctx, x, dtype): @@ -206,7 +206,7 @@ class TestPyLayer(unittest.TestCase): def func_test_pylayer_Exception_forward(self): - class Layer_None1(EagerPyLayer if in_dygraph_mode() else PyLayer): + class Layer_None1(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): 
@staticmethod def forward(ctx, *args): @@ -220,7 +220,7 @@ class TestPyLayer(unittest.TestCase): with self.assertRaises(ValueError): z = Layer_None1.apply(input1) - class Layer_None2(EagerPyLayer if in_dygraph_mode() else PyLayer): + class Layer_None2(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward(ctx, *args): @@ -234,7 +234,7 @@ class TestPyLayer(unittest.TestCase): # return None z = Layer_None2.apply(input1) - class Layer_one1(EagerPyLayer if in_dygraph_mode() else PyLayer): + class Layer_one1(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward(ctx, *args): @@ -249,7 +249,7 @@ class TestPyLayer(unittest.TestCase): with self.assertRaises(ValueError): z = Layer_one1.apply(input1) - class Layer_one2(EagerPyLayer if in_dygraph_mode() else PyLayer): + class Layer_one2(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward(ctx, *args): @@ -263,7 +263,7 @@ class TestPyLayer(unittest.TestCase): # return int z = Layer_one2.apply(input1) - class Layer_no_fw(EagerPyLayer if in_dygraph_mode() else PyLayer): + class Layer_no_fw(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def backward(ctx, *args): @@ -280,7 +280,7 @@ class TestPyLayer(unittest.TestCase): def func_test_pylayer_nograd(self): - class tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): + class tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward(ctx, x1, func1, func2=paddle.square, xx=None): @@ -305,7 +305,8 @@ class TestPyLayer(unittest.TestCase): def func_test_pylayer_Exception_bk(self): - class Layer_bk_none1(EagerPyLayer if in_dygraph_mode() else PyLayer): + class Layer_bk_none1( + EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward(ctx, x): @@ -322,7 +323,8 @@ class TestPyLayer(unittest.TestCase): with self.assertRaises(ValueError): z.sum().backward() - class Layer_bk_none2(EagerPyLayer if in_dygraph_mode() else PyLayer): + class Layer_bk_none2( + EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward(ctx, x1, x2): @@ -339,7 +341,8 @@ class TestPyLayer(unittest.TestCase): with self.assertRaises(ValueError): z.mean().backward() - class Layer_bk_one1(EagerPyLayer if in_dygraph_mode() else PyLayer): + class Layer_bk_one1(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer + ): @staticmethod def forward(ctx, x): @@ -356,7 +359,8 @@ class TestPyLayer(unittest.TestCase): with self.assertRaises(ValueError): z.mean().backward() - class Layer_bk_one2(EagerPyLayer if in_dygraph_mode() else PyLayer): + class Layer_bk_one2(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer + ): @staticmethod def forward(ctx, x1, x2): @@ -374,7 +378,7 @@ class TestPyLayer(unittest.TestCase): with self.assertRaises(ValueError): z.mean().backward() - class Layer_no_bk(EagerPyLayer if in_dygraph_mode() else PyLayer): + class Layer_no_bk(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward(ctx, x): @@ -388,7 +392,8 @@ class TestPyLayer(unittest.TestCase): z = z[0] + z[1] z.mean().backward() - class Layer_bk_match(EagerPyLayer if in_dygraph_mode() else PyLayer): + class Layer_bk_match( + EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward(ctx, x): @@ -412,7 +417,8 @@ class TestPyLayer(unittest.TestCase): def func_test_pylayer_bk_return_none(self): - class Layer_bk_none1(EagerPyLayer if in_dygraph_mode() else PyLayer): + class Layer_bk_none1( + EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): 
@staticmethod def forward(ctx, x1, x2): @@ -431,7 +437,8 @@ class TestPyLayer(unittest.TestCase): with self.assertRaises(ValueError): z.mean().backward() - class Layer_bk_none2(EagerPyLayer if in_dygraph_mode() else PyLayer): + class Layer_bk_none2( + EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward(ctx, x1, x2): @@ -457,7 +464,7 @@ class TestPyLayer(unittest.TestCase): def func_test_pylayer_inplace(self): - class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): + class cus_tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward(ctx, x): @@ -494,7 +501,8 @@ class TestPyLayer(unittest.TestCase): def test_pylayer_inplace_backward_error(self): with _test_eager_guard(): - class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): + class cus_tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer + ): @staticmethod def forward(ctx, x): @@ -530,7 +538,8 @@ class TestPyLayer(unittest.TestCase): def test_pylayer_inplace_backward_success_1(self): with _test_eager_guard(): - class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): + class cus_tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer + ): @staticmethod def forward(ctx, x): @@ -564,7 +573,8 @@ class TestPyLayer(unittest.TestCase): def test_pylayer_inplace_backward_success_2(self): with _test_eager_guard(): - class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): + class cus_tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer + ): @staticmethod def forward(ctx, x): @@ -597,7 +607,8 @@ class TestPyLayer(unittest.TestCase): def func_test_pylayer_inplace_and_leaf_exception(self): - class cus_pylayer_op(EagerPyLayer if in_dygraph_mode() else PyLayer): + class cus_pylayer_op( + EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward(ctx, x): @@ -633,7 +644,7 @@ class TestPyLayer(unittest.TestCase): def func_test_backward_in_backward(self): - class cus_tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): + class cus_tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward(ctx, x): @@ -665,7 +676,7 @@ class TestPyLayer(unittest.TestCase): def func_test_return_to_tensor(self): - class Tanh(EagerPyLayer if in_dygraph_mode() else PyLayer): + class Tanh(EagerPyLayer if in_dygraph_mode() else LegacyPyLayer): @staticmethod def forward(ctx, x1): @@ -779,7 +790,7 @@ class TestPyLayerReturnType(unittest.TestCase): def test_forward_args_fake_tensor(self): - class Tanh(PyLayer): + class Tanh(LegacyPyLayer): @staticmethod def forward(ctx, x1): @@ -797,7 +808,7 @@ class TestPyLayerReturnType(unittest.TestCase): def test_forward_kwargs_fake_tensor(self): - class Tanh(PyLayer): + class Tanh(LegacyPyLayer): @staticmethod def forward(ctx, x1): @@ -815,7 +826,7 @@ class TestPyLayerReturnType(unittest.TestCase): def test_forward_return_fake_tensor(self): - class Tanh(PyLayer): + class Tanh(LegacyPyLayer): @staticmethod def forward(ctx, x1): @@ -833,7 +844,7 @@ class TestPyLayerReturnType(unittest.TestCase): def test_forward_return_fake_tensor_tuple(self): - class Tanh(PyLayer): + class Tanh(LegacyPyLayer): @staticmethod def forward(ctx, x1): @@ -851,7 +862,7 @@ class TestPyLayerReturnType(unittest.TestCase): def test_backward_return_fake_tensor_tuple(self): - class Tanh(PyLayer): + class Tanh(LegacyPyLayer): @staticmethod def forward(ctx, x1, x2): @@ -871,7 +882,7 @@ class TestPyLayerReturnType(unittest.TestCase): def test_backward_return_fake_tensor(self): - class Tanh(PyLayer): + class Tanh(LegacyPyLayer): 
@staticmethod def forward(ctx, x1): diff --git a/python/paddle/incubate/distributed/models/moe/moe_layer.py b/python/paddle/incubate/distributed/models/moe/moe_layer.py index 367b2c189e3adc806da5ad5c42f601f668ffe0f1..28740917c13f830581ecf888c480371cbd5a4b81 100644 --- a/python/paddle/incubate/distributed/models/moe/moe_layer.py +++ b/python/paddle/incubate/distributed/models/moe/moe_layer.py @@ -31,7 +31,7 @@ from paddle.distributed import alltoall, all_gather from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from paddle.distributed import fleet -from paddle.autograd import PyLayer, EagerPyLayer +from paddle.autograd import PyLayer from .gate import NaiveGate, GShardGate, SwitchGate, BaseGate from .utils import count_by_gate from paddle.distributed.fleet.meta_parallel.pp_utils.utils import _hp_recompute @@ -132,53 +132,6 @@ class MoEScatter(PyLayer): return grad_in, None, None, None -class EagerMoEScatter(EagerPyLayer): - r""" - Scatter input samples from [batch x sequences] to contiguous alone experts. - If `world_size` is greater than 1, the samples will first be locally - scattered, and then exchanged across workers. - """ - - @staticmethod - def forward(ctx, - inp, - pos, - local_expert_count, - global_expert_count, - fwd_batch_size, - world_size, - group=None): - local_input_buf = _local_scatter(inp, pos) - if world_size > 1: - global_input_buf = global_scatter(local_input_buf, - local_expert_count, - global_expert_count, - group=group) - else: - global_input_buf = local_input_buf - - ctx.moe_args = inp.shape[0], world_size, group - - variables = (pos, local_expert_count, global_expert_count) - ctx.save_for_backward(*variables) - return global_input_buf - - @staticmethod - def backward(ctx, grad): - (pos, local_expert_count, global_expert_count) = ctx.saved_tensor() - (inp_batch_size, world_size, group) = ctx.moe_args - - if world_size > 1: - local_grad_in = global_gather(grad, - local_expert_count, - global_expert_count, - group=group) - else: - local_grad_in = grad - grad_in = _local_gather(local_grad_in, pos, inp_batch_size) - return grad_in, None, None, None - - class MoEGather(PyLayer): r""" Gather output samples from contiguous alone experts back to [batch x @@ -226,53 +179,6 @@ class MoEGather(PyLayer): return global_grad_out_buf, None, None, None -class EagerMoEGather(EagerPyLayer): - r""" - Gather output samples from contiguous alone experts back to [batch x - sequences]. Works symmetrically with MoEScatter. 
- """ - - @staticmethod - def forward(ctx, - global_output_buf, - pos, - local_expert_count, - global_expert_count, - local_batch_size, - world_size, - group=None): - if world_size > 1: - local_output_buf = global_gather(global_output_buf, - local_expert_count, - global_expert_count, - group=group) - else: - local_output_buf = global_output_buf - output = _local_gather(local_output_buf, - pos, - local_batch_size, - maybe_overlap=False) - - ctx.moe_args = (global_output_buf.shape[0], world_size, group) - variables = (pos, local_expert_count, global_expert_count) - ctx.save_for_backward(*variables) - return output - - @staticmethod - def backward(ctx, grad_out): - pos, local_expert_count, global_expert_count = ctx.saved_tensor() - fwd_batch_size, world_size, group = ctx.moe_args - grad_out_buf = _local_scatter(grad_out, pos) - if world_size > 1: - global_grad_out_buf = global_scatter(grad_out_buf, - local_expert_count, - global_expert_count, - group=group) - else: - global_grad_out_buf = grad_out_buf - return global_grad_out_buf, None, None, None - - class AllGather(PyLayer): r""" A wrapper for the All-Gather function to support auto-differentiation. @@ -295,28 +201,6 @@ class AllGather(PyLayer): ends=[(rank + 1) * dim0]) -class EagerAllGather(EagerPyLayer): - r""" - A wrapper for the All-Gather function to support auto-differentiation. - """ - - @staticmethod - def forward(ctx, inp, rank, world_size, group): - tensor_list = [] - paddle.distributed.all_gather(tensor_list, inp, group=group) - output = paddle.concat(tensor_list, axis=0) - ctx.args = rank, inp.shape[0] - return output - - @staticmethod - def backward(ctx, grad_out): - rank, dim0 = ctx.args - return paddle.slice(grad_out, - axes=[0], - starts=[rank * dim0], - ends=[(rank + 1) * dim0]) - - class Slice(PyLayer): r""" A wrapper for the Slice function to support auto-differentiation. @@ -341,30 +225,6 @@ class Slice(PyLayer): return _all_gather(grad_out, group=group) -class EagerSlice(EagerPyLayer): - r""" - A wrapper for the Slice function to support auto-differentiation. 
- """ - - @staticmethod - def forward(ctx, inp, rank, world_size, group): - B = inp.shape[0] - local_batch_size = B // world_size - batch_start = local_batch_size * rank - batch_end = min(batch_start + local_batch_size, B) - inp = paddle.slice(inp, - axes=[0], - starts=[batch_start], - ends=[batch_end]) - ctx.args = world_size, group - return inp - - @staticmethod - def backward(ctx, grad_out): - world_size, group = ctx.args - return _all_gather(grad_out, group=group) - - def prepare_forward(gate, num_expert, world_size, moe_group): pos, local_expert_count, global_expert_count = count_by_gate( gate, num_expert, world_size, group=moe_group) @@ -517,10 +377,7 @@ class MoELayer(nn.Layer): mp_rank = self.mp_group.rank mp_size = self.mp_group.nranks if mp_size > 1: - if in_dygraph_mode(): - inp = EagerSlice.apply(inp, mp_rank, mp_size, self.mp_group) - else: - inp = Slice.apply(inp, mp_rank, mp_size, self.mp_group) + inp = Slice.apply(inp, mp_rank, mp_size, self.mp_group) value, gate = self.gate(inp) ( @@ -541,14 +398,9 @@ class MoELayer(nn.Layer): temp_pos = pos assert topk == self.top_k - if in_dygraph_mode(): - x = EagerMoEScatter.apply(inp, temp_pos, local_expert_count, - global_expert_count, fwd_batch_size, - self.world_size, self.group) - else: - x = MoEScatter.apply(inp, temp_pos, local_expert_count, - global_expert_count, fwd_batch_size, - self.world_size, self.group) + x = MoEScatter.apply(inp, temp_pos, local_expert_count, + global_expert_count, fwd_batch_size, + self.world_size, self.group) d_model = self.d_model @@ -577,23 +429,15 @@ class MoELayer(nn.Layer): if len(gate.shape) == 2: out_batch_size *= gate.shape[1] - if in_dygraph_mode(): - x = EagerMoEGather.apply(x, pos, local_expert_count, - global_expert_count, out_batch_size, - self.world_size, self.group) - else: - x = MoEGather.apply(x, pos, local_expert_count, global_expert_count, - out_batch_size, self.world_size, self.group) + x = MoEGather.apply(x, pos, local_expert_count, global_expert_count, + out_batch_size, self.world_size, self.group) x = x.reshape([-1, self.top_k, d_model]) value = value.reshape([x.shape[0], 1, self.top_k]) x = paddle.bmm(value, x).reshape([-1, d_model]) if mp_size > 1: - if in_dygraph_mode(): - x = EagerAllGather.apply(x, mp_rank, mp_size, self.mp_group) - else: - x = AllGather.apply(x, mp_rank, mp_size, self.mp_group) + x = AllGather.apply(x, mp_rank, mp_size, self.mp_group) x = paddle.reshape_(x, origin_shape)