From 83b953f56f68470cfb285d0c127a53681c32800f Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Thu, 1 Apr 2021 14:55:48 +0800 Subject: [PATCH] add custom init grad for backward function (#31540) * add custom init grad for backward function * add custom init grad for backward function * handle when the grad_tensor is none * handle when the grad_tensor is none * fix the args type error on windows platform * modify the args order and doc * format code * add grad_tensor to xpu * modify the grad_tensor type check * add paddle.backward api to support multi tensors gradient compute * add paddle.backward api to support multi tensors gradient compute * add paddle.atuograd module and backward api * change tensor.backward func args * modify tensor backward api * remove create_graph intputs args * add doc and examplex code for backward api * when have the same tensor, throw error * modify test Init func args * modify the execute.Init func args in test files * add paddle.autograd package in setup.py.in * modify error msg, remove _run_backward method in class Tensor * add test cases for backward api --- paddle/fluid/imperative/basic_engine.cc | 115 ++++++++++------- paddle/fluid/imperative/basic_engine.h | 6 +- paddle/fluid/imperative/tests/test_hooks.cc | 8 +- paddle/fluid/imperative/tests/test_tracer.cc | 9 +- paddle/fluid/pybind/imperative.cc | 26 ++-- python/paddle/__init__.py | 1 + python/paddle/autograd/__init__.py | 22 ++++ python/paddle/autograd/backward_mode.py | 119 ++++++++++++++++++ python/paddle/fluid/dygraph/base.py | 1 + .../fluid/dygraph/varbase_patch_methods.py | 36 +++++- .../tests/unittests/test_custom_grad_input.py | 119 ++++++++++++++++++ python/setup.py.in | 1 + 12 files changed, 397 insertions(+), 66 deletions(-) create mode 100644 python/paddle/autograd/__init__.py create mode 100644 python/paddle/autograd/backward_mode.py create mode 100644 python/paddle/fluid/tests/unittests/test_custom_grad_input.py diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 9e46af9cb72..2a439a6f1ea 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -36,48 +36,73 @@ DECLARE_bool(sort_sum_gradient); namespace paddle { namespace imperative { -void BasicEngine::Init(VarBase* var, bool retain_graph) { +void BasicEngine::Init( + const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph) { retain_graph_ = retain_graph; - init_node_ = var->GradVarBase()->GradNode(); - PADDLE_ENFORCE_EQ(var->GradVarBase()->GraphIsFreed(), false, - platform::errors::Unavailable( - "%s trying to backward through the same graph a second " - "time, but this graph have already been freed. 
Please " - "specify Tensor.backward(retain_graph=True) when " - "calling backward at the first time.", - var->Name())); - - if (!retain_graph) { - VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() - << " because of retain_graph=False when calling backward"; - var->GradVarBase()->SetGraphIsFreed(true); - var->GradVarBase()->ClearGradNode(); - } - if (init_node_ == nullptr || var->OverridedStopGradient()) { - VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " - "stop_gradient=True: " - << var->Name(); - return; - } + PADDLE_ENFORCE_EQ( + tensors.size(), grad_tensors.size(), + platform::errors::Unavailable( + "The size of tensors do not equal the size of grad_tensors," + "the size of tensors is %s, but the size of grad_tensors is %s.", + tensors.size(), grad_tensors.size())); + + for (size_t i = 0; i < tensors.size(); ++i) { + auto var = tensors[i]; + auto grad_tensor = grad_tensors[i]; + + auto init_node = var->GradVarBase()->GradNode(); + PADDLE_ENFORCE_EQ( + var->GradVarBase()->GraphIsFreed(), false, + platform::errors::Unavailable( + "%s trying to backward through the same graph a second " + "time, but this graph have already been freed. Please " + "specify Tensor.backward(retain_graph=True) when " + "calling backward at the first time.", + var->Name())); + + if (!retain_graph) { + VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() + << " because of retain_graph=False when calling backward"; + var->GradVarBase()->SetGraphIsFreed(true); + var->GradVarBase()->ClearGradNode(); + } - VLOG(3) << "Init first node of backward"; + if (init_node == nullptr || var->OverridedStopGradient()) { + VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " + "stop_gradient=True: " + << var->Name(); + continue; + } - PADDLE_ENFORCE_EQ( - var->HasGradVar(), true, - platform::errors::NotFound("Grad variable not exist for variable %s", - var->Name())); - - auto& fwd_var = var->Var().Get(); - auto* grad_var = - var->GradVarBase()->MutableVar()->GetMutable(); - VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() - << " as stop_gradient false"; - var->GradVarBase()->InnerSetOverridedStopGradient(false); - auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place()); - grad_var->Resize(fwd_var.dims()); - grad_var->mutable_data(fwd_var.place(), fwd_var.type()); - operators::math::set_constant(*dev_ctx, grad_var, 1.0); + VLOG(3) << "Init node of backward"; + + PADDLE_ENFORCE_EQ( + var->HasGradVar(), true, + platform::errors::NotFound("Tensor %s has no gradient", var->Name())); + + auto& fwd_var = var->Var().Get(); + auto* grad_var = + var->GradVarBase()->MutableVar()->GetMutable(); + VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() + << " as stop_gradient false"; + var->GradVarBase()->InnerSetOverridedStopGradient(false); + auto* dev_ctx = + platform::DeviceContextPool::Instance().Get(fwd_var.place()); + if (grad_tensor == nullptr) { + grad_var->Resize(fwd_var.dims()); + grad_var->mutable_data(fwd_var.place(), fwd_var.type()); + operators::math::set_constant(*dev_ctx, grad_var, 1.0); + } else { + paddle::framework::TensorCopy( + grad_tensor->Var().Get(), fwd_var.place(), + *dev_ctx, grad_var); + } + + init_nodes_.push_back(init_node); + } } void BasicEngine::CheckBackwardInputs(const OpBase& op) { @@ -224,8 +249,10 @@ void BasicEngine::PrepareDeps() { std::queue q; std::unordered_set visited; - q.push(init_node_.get()); - visited.insert(init_node_.get()); + for (size_t i = 0; i < init_nodes_.size(); ++i) { 
+ q.push(init_nodes_[i].get()); + visited.insert(init_nodes_[i].get()); + } while (!q.empty()) { auto* cur_node = q.front(); @@ -276,14 +303,16 @@ static std::shared_ptr> CallGradientHooks( } void BasicEngine::Execute() { - if (init_node_ == nullptr) { + if (init_nodes_.empty()) { return; } PrepareDeps(); // Start execute Computation graph std::queue> q; - q.push(std::move(init_node_)); + for (size_t i = 0; i < init_nodes_.size(); ++i) { + q.push(std::move(init_nodes_[i])); + } size_t op_num = 0; @@ -505,7 +534,7 @@ void BasicEngine::Execute() { } void BasicEngine::Clear() { - init_node_.reset(); + init_nodes_.clear(); node_deps_.clear(); accumulators_.clear(); accumulators_with_grad_node_.clear(); diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index a2ad8b5f8aa..49761a8df0b 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -30,7 +30,9 @@ class OpBase; class BasicEngine : public Engine { public: - void Init(VarBase* var, bool retain_graph = false); + void Init(const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph = false); void Execute() override; @@ -46,7 +48,7 @@ class BasicEngine : public Engine { void Clear(); private: - std::shared_ptr init_node_; + std::vector> init_nodes_; std::unordered_map node_deps_; // The input and output of Inplace op are the same. If only `var` is used // as the key, then the input and output of inplace op must be gradient diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 9b75fac0ca5..8c907b98906 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -92,8 +92,10 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); // 3. backward + std::vector> tensors{out}; + std::vector> grad_tensors{nullptr}; BasicEngine engine; - engine.Init(out.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); framework::LoDTensor x_grad; @@ -191,8 +193,10 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); // 3. 
backward + std::vector> tensors{out}; + std::vector> grad_tensors{nullptr}; BasicEngine engine; - engine.Init(out.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); framework::LoDTensor x_grad; diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 9e3b0ea5df6..76de413b3e6 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -250,7 +250,10 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) { tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map, gpu_place, true); imperative::BasicEngine engine; - engine.Init(reduce_sum_out.get()); + + std::vector> tensors{reduce_sum_out}; + std::vector> grad_tensors{nullptr}; + engine.Init(tensors, grad_tensors); engine.Execute(); framework::LoDTensor rlt; @@ -376,8 +379,10 @@ TEST(test_tracer, test_var_without_grad_var) { ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL); ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL); + std::vector> tensors{vout}; + std::vector> grad_tensors{nullptr}; imperative::BasicEngine engine; - engine.Init(vout.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); // check the grad diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index c1c1387a84c..4ab507fe367 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -805,6 +805,7 @@ void BindImperative(py::module *m_ptr) { Bump the version whenever the Tensor is modified through an inplace operation. )DOC") .def("numpy", + [](imperative::VarBase &self) -> py::array { const auto &tensor = self.MutableVar()->Get(); @@ -1003,18 +1004,6 @@ void BindImperative(py::module *m_ptr) { print(x.stop_gradient) # True print(x.grad) # None )DOC") - .def("_run_backward", - [](imperative::VarBase &self, const imperative::Tracer &tracer, - bool retain_graph) { - // TODO(jiabin): when we impl more backward execution we can - // select them - auto *engine = tracer.GetEngine(); - engine->Init(&self, retain_graph); - VLOG(3) << "Start backward"; - engine->Execute(); - VLOG(3) << "Finish backward"; - }, - py::call_guard()) .def("_grad_name", &imperative::VarBase::GradVarName) .def("_grad_value", [](imperative::VarBase &self) { @@ -1549,6 +1538,19 @@ void BindImperative(py::module *m_ptr) { }, py::call_guard()); + m.def( + "dygraph_run_backward", + [](const std::vector> &tensors, + const std::vector> &grad_tensors, + bool retain_graph, const imperative::Tracer &tracer) { + auto *engine = tracer.GetEngine(); + engine->Init(tensors, grad_tensors, retain_graph); + VLOG(3) << "Start backward"; + engine->Execute(); + VLOG(3) << "Finish backward"; + }, + py::call_guard()); + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) py::class_ 0, "{} connot be empyt".format(name) + for each_var in in_out_list: + assert isinstance( + each_var, paddle. + Tensor), "Elements of {} must be paddle.Tensor".format(name) + return in_out_list + else: + assert isinstance( + in_out_list, + paddle.Tensor), "{} must be Tensor or list of Tensor".format( + name) + return [in_out_list] + + tensors = check_tensors(tensors, "tensors") + + assert len(tensors) == len( + set(tensors) + ), "The argument 'tensors' of paddle.autograd.backward contains duplicate paddle.Tensor object." 
+
+    if grad_tensors is not None:
+        if not isinstance(grad_tensors, (list, tuple)):
+            grad_tensors = [grad_tensors]
+
+        for each_tensor in grad_tensors:
+            if each_tensor is not None:
+                assert isinstance(
+                    each_tensor, paddle.Tensor
+                ), "The argument 'grad_tensors' of paddle.autograd.backward is invalid, it can be 'None', 'paddle.Tensor' or 'list[None/paddle.Tensor]'."
+    else:
+        grad_tensors = [None] * len(tensors)
+
+    if len(grad_tensors) > 0:
+        assert len(tensors) == len(
+            grad_tensors), "The length of grad_tensors must be equal to the length of tensors"
+
+    assert isinstance(retain_graph, bool), "retain_graph must be True or False"
+
+    core.dygraph_run_backward(tensors, grad_tensors, retain_graph,
+                              framework._dygraph_tracer())
diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py
index 08d58e0c808..be5d9ac5831 100644
--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
@@ -26,6 +26,7 @@ import logging
 from ..data_feeder import convert_dtype
 import warnings
 from ..framework import _get_paddle_place
+import paddle
 
 __all__ = [
     'no_grad', 'no_grad_', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph',
diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index e565552632f..ac594709867 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -163,7 +163,7 @@ def monkey_patch_varbase():
                 framework._current_expected_place())
 
     @framework.dygraph_only
-    def backward(self, retain_graph=False):
+    def backward(self, grad_tensor=None, retain_graph=False):
         """
         Run backward of current Graph which starts from current Tensor.
@@ -172,17 +172,22 @@ def monkey_patch_varbase():
         You can clear gradient by ``Tensor.clear_grad()`` .
 
         Args:
+            grad_tensor(Tensor, optional): initial gradient values of the current Tensor. If `grad_tensor` is None,
+            the initial gradient values of the current Tensor would be a Tensor filled with 1.0;
+            if `grad_tensor` is not None, it must have the same shape as the current Tensor.
+            The default value is None.
+
             retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would
                 like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter
                 :code:`retain_graph` to True, then the grads will be retained. Thus, setting it to False is much more memory-efficient.
                 Defaults to False.
-
         Returns:
             NoneType: None
 
         Examples:
             .. code-block:: python
 
+                import paddle
                 x = paddle.to_tensor(5., stop_gradient=False)
                 for i in range(5):
                     y = paddle.pow(x, 4.0)
@@ -198,15 +203,36 @@ def monkey_patch_varbase():
                 print("{}".format(x.grad))
                 # 0.
 
+                grad_tensor=paddle.to_tensor(2.)
+                for i in range(5):
+                    y = paddle.pow(x, 4.0)
+                    y.backward(grad_tensor)
+                    print("{}: {}".format(i, x.grad))
+                # 0: [1000.]
+                # 1: [2000.]
+                # 2: [3000.]
+                # 3: [4000.]
+                # 4: [5000.]
+
         """
         if framework.in_dygraph_mode():
+            if grad_tensor is not None:
+                assert isinstance(
+                    grad_tensor, paddle.
+                    Tensor), "The type of grad_tensor must be paddle.Tensor"
+                assert grad_tensor.shape == self.shape, \
+                    "Tensor shape does not match: grad_tensor [ {} ] with shape {} does not match Tensor [ {} ] with shape {}".format(
+                    grad_tensor.name, grad_tensor.shape, self.name, self.shape)
+
             if paddle.is_compiled_with_xpu():
                 # TODO(liuyuhui): Currently only for xpu. Will be removed in the future.
scaled_loss = scale_loss(self) - scaled_loss._run_backward(framework._dygraph_tracer(), - retain_graph) + core.dygraph_run_backward([scaled_loss], [grad_tensor], + retain_graph, + framework._dygraph_tracer()) else: - self._run_backward(framework._dygraph_tracer(), retain_graph) + core.dygraph_run_backward([self], [grad_tensor], retain_graph, + framework._dygraph_tracer()) else: raise ValueError( "Variable.backward() is only available in DyGraph mode") diff --git a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py new file mode 100644 index 00000000000..a7472e7ffd7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py @@ -0,0 +1,119 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.fluid.dygraph as dg +from op_test import OpTest + + +class TestTensorBackward(unittest.TestCase): + def setUp(self): + self._dtypes = ["float32", "float64"] + self._places = [paddle.CPUPlace()] + if paddle.is_compiled_with_cuda(): + self._places.append(paddle.CUDAPlace(0)) + + def test_tensor_backward(self): + for dtype in self._dtypes: + x = np.random.random([2, 100]).astype(dtype) + y = np.random.random([100, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.random.random(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor = paddle.matmul(x_tensor, y_tensor) + + grad_tensor = paddle.to_tensor(grad) + z_tensor.backward(grad_tensor) + + x_grad = np.matmul(grad, y.T) + + self.assertTrue(np.allclose(x_grad, x_tensor.grad)) + + +class TestBackwardAPI(unittest.TestCase): + def setUp(self): + self._dtypes = ["float32", "float64"] + self._places = [paddle.CPUPlace()] + if paddle.is_compiled_with_cuda(): + self._places.append(paddle.CUDAPlace(0)) + + def test_backward_api(self): + for dtype in self._dtypes: + x = np.random.random([2, 2]).astype(dtype) + y = np.random.random([2, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.random.random(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor1 = paddle.matmul(x_tensor, y_tensor) + z_tensor2 = paddle.matmul(x_tensor, y_tensor) + + grad_tensor = paddle.to_tensor(grad) + paddle.autograd.backward([z_tensor1, z_tensor2], + [grad_tensor, grad_tensor], True) + + x_grad = np.matmul(grad, y.T) + + self.assertTrue(np.allclose(x_grad * 2, x_tensor.grad)) + + def test_backward_single_tensor(self): + for dtype in self._dtypes: + x = np.random.random([2, 2]).astype(dtype) + y = np.random.random([2, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.random.random(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = 
paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor1 = paddle.matmul(x_tensor, y_tensor) + + grad_tensor = paddle.to_tensor(grad) + paddle.autograd.backward(z_tensor1, grad_tensor, True) + + x_grad = np.matmul(grad, y.T) + + self.assertTrue(np.allclose(x_grad, x_tensor.grad)) + + def test_backward_none_grad_tensor(self): + for dtype in self._dtypes: + x = np.random.random([2, 2]).astype(dtype) + y = np.random.random([2, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.ones(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor1 = paddle.matmul(x_tensor, y_tensor) + + paddle.autograd.backward(z_tensor1, None) + + x_grad = np.matmul(grad, y.T) + + self.assertTrue(np.allclose(x_grad, x_tensor.grad)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index 73c773bab49..e4532b3e55d 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -216,6 +216,7 @@ packages=['paddle', 'paddle.static.amp', 'paddle.tensor', 'paddle.onnx', + 'paddle.autograd', ] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: -- GitLab
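Usage reference for the API introduced by this patch. This is a minimal sketch assembled from the docstring added to Tensor.backward and the cases in test_custom_grad_input.py; the tensor shapes and values below are illustrative only and are not part of the patch.

    import paddle

    x = paddle.to_tensor([[1., 2.], [3., 4.]], stop_gradient=False)
    y = paddle.to_tensor([[5., 6.], [7., 8.]])
    z1 = paddle.matmul(x, y)
    z2 = paddle.matmul(x, y)

    # Custom initial gradient; it must have the same shape as the tensor
    # whose backward pass it seeds (Tensor.backward asserts this).
    grad = paddle.ones_like(z1)

    # Single-tensor form patched onto Tensor in varbase_patch_methods.py:
    #   z1.backward(grad, retain_graph=True)
    # Multi-tensor form added in python/paddle/autograd/backward_mode.py;
    # gradients from z1 and z2 accumulate into x.grad, as checked in
    # TestBackwardAPI.
    paddle.autograd.backward([z1, z2], [grad, grad], True)

    print(x.grad)  # equals 2 * (grad @ y.T) for this example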