From 583ebab760ec3e07b251623e98ee0071a02e6ad2 Mon Sep 17 00:00:00 2001
From: JZ-LIANG <38102074+JZ-LIANG@users.noreply.github.com>
Date: Sun, 25 Apr 2021 22:19:34 +0800
Subject: [PATCH] Dygraph Recompute (#32516)

* Dygraph recompute

* unittest for Dygraph recompute

* dy recompute remove unittest for win and mac
---
 .../distributed/fleet/utils/__init__.py       |   1 +
 .../distributed/fleet/utils/recompute.py      | 177 ++++++++++++++++++
 .../fluid/tests/unittests/CMakeLists.txt      |   1 +
 .../tests/unittests/test_dygraph_recompute.py | 176 +++++++++++++++++
 4 files changed, 355 insertions(+)
 create mode 100644 python/paddle/distributed/fleet/utils/recompute.py
 create mode 100755 python/paddle/fluid/tests/unittests/test_dygraph_recompute.py

diff --git a/python/paddle/distributed/fleet/utils/__init__.py b/python/paddle/distributed/fleet/utils/__init__.py
index 774e8db0df5..0a47750ead7 100644
--- a/python/paddle/distributed/fleet/utils/__init__.py
+++ b/python/paddle/distributed/fleet/utils/__init__.py
@@ -14,3 +14,4 @@
 
 from .fs import LocalFS, HDFSClient
 from .ps_util import DistributedInfer
+from .recompute import recompute
diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py
new file mode 100644
index 00000000000..0dc305ec77d
--- /dev/null
+++ b/python/paddle/distributed/fleet/utils/recompute.py
@@ -0,0 +1,177 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle.fluid import core
+from paddle.autograd import PyLayer
+from paddle.fluid import framework
+import contextlib
+
+import logging
+logging.basicConfig(  # NOTE(review): runs at import time and targets the root logger
+    format='%(asctime)s %(levelname)-8s %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S')
+
+
+def detach_variable(inputs):
+    """Return ``inputs`` as a tuple, each VarBase replaced by its detach()."""
+    detached = []
+    for item in inputs:
+        if isinstance(item, core.VarBase):
+            # the detach() result takes over the source's stop_gradient flag;
+            # anything that is not a VarBase is passed through as it is
+            source, item = item, item.detach()
+            item.stop_gradient = source.stop_gradient
+        detached.append(item)
+    return tuple(detached)
+
+
+def check_recompute_necessary(inputs):
+    """Log a warning if every Tensor in ``inputs`` has stop_gradient set."""
+    if all(x.stop_gradient for x in inputs if isinstance(x, paddle.Tensor)):
+        logging.warning(  # logging.warn() is a deprecated alias
+            "[Recompute]: None of the inputs to current recompute block need grad, "
+            "therefore there is NO need to recompute this block in backward !")
+
+
+@contextlib.contextmanager
+def swith_rng_state(rng_state):  # context manager: temporarily install rng_state
+    saved_state = paddle.get_cuda_rng_state()  # what to put back on exit
+    paddle.set_cuda_rng_state(rng_state)
+    try:
+        yield
+    finally:  # put the saved state back even if the with-body raises
+        paddle.set_cuda_rng_state(saved_state)
+
+
+class RecomputeFunction(PyLayer):  # forward runs run_function under no_grad; backward re-runs it
+    @staticmethod
+    def forward(ctx, run_function, preserve_rng_state, *args):
+        check_recompute_necessary(args)
+
+        # keep the callable and the RNG flag on ctx; backward() re-runs the callable
+        ctx.run_function = run_function
+        ctx.preserve_rng_state = preserve_rng_state
+
+        # NOTE: backward() must return exactly one gradient per tensor input of forward(),
+        # in the same order as those tensors appear in forward()'s arguments.
+        # Non-tensor inputs are filtered out and get no gradient slot.
+
+        # split args: tensors go to save_for_backward, the rest stays in ctx.inputs (None marks a tensor slot)
+        ctx.inputs = []
+        ctx.tensor_indices = []
+        tensor_inputs = []
+        for i, arg in enumerate(args):
+            if paddle.is_tensor(arg):
+                tensor_inputs.append(arg)
+                ctx.tensor_indices.append(i)
+                ctx.inputs.append(None)
+            else:
+                ctx.inputs.append(arg)
+        ctx.save_for_backward(*tensor_inputs)
+
+        # NOTE: recompute with RNG restore supports only one scenario: one process per CUDA GPU.
+        # One process driving multiple GPUs and mixed GPU/CPU scenarios are not supported.
+        if ctx.preserve_rng_state:
+            cur_device = paddle.get_device()
+            if 'gpu:' not in cur_device:
+                raise RuntimeError(
+                    "Recompute with RNG perserve is not support current device: {}.".
+                    format(cur_device))
+            ctx.fw_cuda_rng_state = paddle.get_cuda_rng_state()
+
+        # TODO: support AMP
+
+        with paddle.no_grad():  # the forward result is computed inside no_grad
+            outputs = run_function(*args)
+
+        return outputs
+
+    @staticmethod
+    def backward(ctx, *args):
+        with paddle.fluid.dygraph.guard():
+            # TODO: check whether this recompute call is valid
+
+            # Rebuild the forward argument list: saved tensors go back into their None slots
+            inputs = list(ctx.inputs)
+            tensor_indices = ctx.tensor_indices
+            tensors = ctx.saved_tensor()
+            for i, idx in enumerate(tensor_indices):
+                inputs[idx] = tensors[i]
+
+            # stand-in for paddle.enable_grad(); NOTE(review): the flag is not restored in this method
+            tracer = framework._dygraph_tracer()
+            tracer._has_grad = True
+
+            # TODO: support AMP
+
+            if ctx.preserve_rng_state:
+                with swith_rng_state(ctx.fw_cuda_rng_state):
+                    detached_inputs = detach_variable(tuple(inputs))
+                    outputs = ctx.run_function(*detached_inputs)
+            else:
+                detached_inputs = detach_variable(tuple(inputs))
+                outputs = ctx.run_function(*detached_inputs)
+
+            if isinstance(outputs, core.VarBase):
+                outputs = (outputs, )
+            assert len(outputs) == len(args)
+
+            # collect the recomputed outputs that are VarBase with stop_gradient == False
+            forward_outputs_with_grad = []
+            backward_inputs = list(args)  # NOTE(review): not filtered like the outputs; the assert below needs equal lengths
+            for i in range(len(outputs)):
+                if isinstance(outputs[i],
+                              core.VarBase) and not outputs[i].stop_gradient:
+                    forward_outputs_with_grad.append(outputs[i])
+            if len(forward_outputs_with_grad) == 0:
+                raise RuntimeError(
+                    "none of output has requires_grad=True, this recompute() is not necessary"
+                )
+
+            assert len(backward_inputs) == len(
+                forward_outputs_with_grad
+            ), "number of forward outputs is [{}], but the backward got [{}] inputs".format(
+                len(forward_outputs_with_grad), len(backward_inputs))
+
+            # run backward on the recomputed outputs, passing the incoming gradients as grad tensors
+            paddle.autograd.backward(forward_outputs_with_grad, backward_inputs)
+
+            grads = list(inp._grad_ivar() for inp in detached_inputs
+                         if isinstance(inp, core.VarBase))  # one entry per VarBase input, in input order
+
+            return grads
+
+
+def recompute(function, *args, **kwargs):
+    """
+    Run ``function(*args)`` via RecomputeFunction, which re-runs it in backward.
+
+    Args:
+        function: callable (a layer or a sequence of layers) forming the
+            forward segment that is executed again during the backward pass.
+        *args: positional inputs passed to ``function``.
+        preserve_rng_state(bool, optional): keyword-only, default True; handed
+            to RecomputeFunction, which saves and restores the CUDA RNG state.
+
+    Returns:
+        Output of ``function`` on ``args``.
+
+    Raises:
+        ValueError: if any keyword other than ``preserve_rng_state`` is passed.
+    """
+    preserve_rng_state = kwargs.pop('preserve_rng_state', True)
+    if kwargs:
+        raise ValueError("Unexpected keyword arguments: " + ",".join(kwargs))
+    return RecomputeFunction.apply(function, preserve_rng_state, *args)
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 2e68dd899ee..ae3cf5f2858 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -176,6 +176,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM))
     list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_pipeline_layer)
     LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision)
     LIST(REMOVE_ITEM TEST_OPS test_fleet_base_single)
+    LIST(REMOVE_ITEM TEST_OPS test_dygraph_recompute)
 elseif(WITH_GPU)
     if (${CUDNN_VERSION} VERSION_LESS 7100)
         LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py b/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py
new file mode 100755
index 00000000000..6de04c14bfa
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py
@@ -0,0 +1,176 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+
+import paddle
+from paddle.autograd import PyLayer
+from paddle.distributed.fleet.utils import recompute
+import random
+
+import paddle.fluid.layers as layers
+
+
+def get_fc_block(block_idx, input_size, is_last=False):
+    """Build one named block: fc_0 -> dropout -> relu -> fc_1 -> relu -> fc_2.
+
+    Every Linear is bias-free and ``input_size`` wide, except that fc_2 has a
+    single output feature when ``is_last`` is True.
+    """
+    prefix = "block_" + str(block_idx)
+
+    def linear(out_features):
+        return paddle.nn.Linear(input_size, out_features, bias_attr=False)
+
+    block = paddle.nn.Sequential(
+        (prefix + "_fc_0", linear(input_size)),
+        (prefix + "_dropout", paddle.nn.Dropout(p=0.5)),
+        (prefix + "_relu_1", paddle.nn.ReLU()),
+        (prefix + "_fc_1", linear(input_size)),
+        (prefix + "_relu_2", paddle.nn.ReLU()), )
+
+    # fc_2 is appended afterwards; only its output width depends on is_last
+    block.add_sublayer(prefix + "_fc_2", linear(1 if is_last else input_size))
+    return block
+
+
+class Naive_fc_net(paddle.nn.Layer):
+    """Five chained FC blocks; selected blocks are run through recompute().
+
+    Args:
+        input_size: feature width handed to every get_fc_block call.
+        recompute_blocks: indices (0-4) of the blocks wrapped in recompute();
+            None means [1, 3].
+        recompute_kwargs: extra keyword arguments forwarded to recompute()
+            for block 2 only; None means no extra arguments.
+    """
+
+    def __init__(self,
+                 input_size=10,
+                 recompute_blocks=None,
+                 recompute_kwargs=None):
+        super(Naive_fc_net, self).__init__()
+        # None sentinels avoid sharing mutable default objects across instances
+        if recompute_blocks is None:
+            recompute_blocks = [1, 3]
+        if recompute_kwargs is None:
+            recompute_kwargs = {}
+        self.recompute_blocks = recompute_blocks
+        self.recompute_kwargs = recompute_kwargs
+        self.runfunc0 = get_fc_block(0, input_size, is_last=False)
+        self.runfunc1 = get_fc_block(1, input_size, is_last=False)
+        self.runfunc2 = get_fc_block(2, input_size, is_last=False)
+        self.runfunc3 = get_fc_block(3, input_size, is_last=False)
+        self.runfunc4 = get_fc_block(4, input_size, is_last=True)
+
+    def forward(self, inputs):
+        """Apply blocks 0-4 in order, recomputing those in recompute_blocks."""
+        blocks = (self.runfunc0, self.runfunc1, self.runfunc2, self.runfunc3,
+                  self.runfunc4)
+        for idx, block in enumerate(blocks):
+            if idx not in self.recompute_blocks:
+                inputs = block(inputs)
+            elif idx == 2:
+                # only block 2 receives the user-supplied recompute kwargs
+                inputs = recompute(block, inputs, **self.recompute_kwargs)
+            else:
+                inputs = recompute(block, inputs)
+        return inputs
+
+
+def run_model(cuda_state, recompute_block=None, recompute_kwargs=None):
+    """Seed all RNGs, train a Naive_fc_net for 10 SGD steps, return the history.
+
+    ``recompute_block`` / ``recompute_kwargs`` default to ``[]`` / ``{}`` (None).
+    Returns per-step lists: loss, parameters()[9] after the step, and the
+    gradient of parameters()[3].
+    """
+    recompute_block = [] if recompute_block is None else recompute_block
+    recompute_kwargs = {} if recompute_kwargs is None else recompute_kwargs
+    gen = paddle.seed(10)
+    gen.manual_seed(10)
+    np.random.seed(10)
+    random.seed(10)
+    if cuda_state:
+        paddle.set_cuda_rng_state(cuda_state)
+
+    batch_size, input_size = 1, 10
+    model = Naive_fc_net(
+        input_size,
+        recompute_blocks=recompute_block,
+        recompute_kwargs=recompute_kwargs)
+    loss_fn = paddle.nn.MSELoss(reduction='mean')
+    optimizer = paddle.optimizer.SGD(learning_rate=0.01,
+                                     parameters=model.parameters())
+    loss_, param_, grad_ = [], [], []
+    for _ in range(10):
+        x_data = np.random.randn(batch_size, input_size).astype(np.float32)
+        x = paddle.to_tensor(x_data)
+        y_pred = model(x)
+        loss = y_pred.mean()
+        loss_.append(np.asarray(loss).tolist())
+        loss.backward()
+        optimizer.step()
+        param_.append(np.asarray(model.parameters()[9]).tolist())
+        grad_.append(np.asarray(model.parameters()[3]._grad_ivar()).tolist())
+        optimizer.clear_grad()
+    return loss_, param_, grad_
+
+
+class TestPyLayer(unittest.TestCase):
+    def test_fc_net_with_dropout(self):  # recompute must not change any result
+        def check_identical(loss_ref, param_ref, grad_ref, loss, param, grad):
+            self.assertEqual(loss_ref, loss)
+            self.assertEqual(param_ref, param)
+            self.assertEqual(grad_ref, grad)
+
+        cuda_state = paddle.get_cuda_rng_state()
+        # reference run without recompute; every layout below must match it
+        loss_ref, param_ref, grad_ref = run_model(
+            cuda_state, recompute_block=[])
+
+        # recompute second block only (was [1, 3], duplicating the last case)
+        loss, param, grad = run_model(cuda_state, recompute_block=[1])
+        check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)
+
+        # recompute fourth block
+        loss, param, grad = run_model(cuda_state, recompute_block=[3])
+        check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)
+
+        # recompute second to fourth block
+        loss, param, grad = run_model(cuda_state, recompute_block=[1, 2, 3])
+        check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)
+
+        # recompute second & fourth block
+        loss, param, grad = run_model(cuda_state, recompute_block=[1, 3])
+        check_identical(loss_ref, param_ref, grad_ref, loss, param, grad)
+
+    def test_recompute_kwargs(self):
+        """An unknown keyword passed to recompute() must raise ValueError."""
+        paddle.set_device("gpu")
+        kwargs = {"is_test": False}
+        with self.assertRaises(ValueError):
+            run_model(None, recompute_block=[2], recompute_kwargs=kwargs)
+
+    def test_recompute_cpu_rng(self):
+        paddle.set_device("cpu")  # default RNG preservation rejects non-GPU devices
+        with self.assertRaises(RuntimeError):
+            run_model(None, recompute_block=[2])
+
+
+if __name__ == '__main__':  # run the test cases when executed as a script
+    unittest.main()
-- 
GitLab