From 83b953f56f68470cfb285d0c127a53681c32800f Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Thu, 1 Apr 2021 14:55:48 +0800 Subject: [PATCH] add custom init grad for backward function (#31540) * add custom init grad for backward function * add custom init grad for backward function * handle when the grad_tensor is none * handle when the grad_tensor is none * fix the args type error on windows platform * modify the args order and doc * format code * add grad_tensor to xpu * modify the grad_tensor type check * add paddle.backward api to support multi tensors gradient compute * add paddle.backward api to support multi tensors gradient compute * add paddle.atuograd module and backward api * change tensor.backward func args * modify tensor backward api * remove create_graph intputs args * add doc and examplex code for backward api * when have the same tensor, throw error * modify test Init func args * modify the execute.Init func args in test files * add paddle.autograd package in setup.py.in * modify error msg, remove _run_backward method in class Tensor * add test cases for backward api --- paddle/fluid/imperative/basic_engine.cc | 115 ++++++++++------- paddle/fluid/imperative/basic_engine.h | 6 +- paddle/fluid/imperative/tests/test_hooks.cc | 8 +- paddle/fluid/imperative/tests/test_tracer.cc | 9 +- paddle/fluid/pybind/imperative.cc | 26 ++-- python/paddle/__init__.py | 1 + python/paddle/autograd/__init__.py | 22 ++++ python/paddle/autograd/backward_mode.py | 119 ++++++++++++++++++ python/paddle/fluid/dygraph/base.py | 1 + .../fluid/dygraph/varbase_patch_methods.py | 36 +++++- .../tests/unittests/test_custom_grad_input.py | 119 ++++++++++++++++++ python/setup.py.in | 1 + 12 files changed, 397 insertions(+), 66 deletions(-) create mode 100644 python/paddle/autograd/__init__.py create mode 100644 python/paddle/autograd/backward_mode.py create mode 100644 python/paddle/fluid/tests/unittests/test_custom_grad_input.py diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 9e46af9cb72..2a439a6f1ea 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -36,48 +36,73 @@ DECLARE_bool(sort_sum_gradient); namespace paddle { namespace imperative { -void BasicEngine::Init(VarBase* var, bool retain_graph) { +void BasicEngine::Init( + const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph) { retain_graph_ = retain_graph; - init_node_ = var->GradVarBase()->GradNode(); - PADDLE_ENFORCE_EQ(var->GradVarBase()->GraphIsFreed(), false, - platform::errors::Unavailable( - "%s trying to backward through the same graph a second " - "time, but this graph have already been freed. 
Please " - "specify Tensor.backward(retain_graph=True) when " - "calling backward at the first time.", - var->Name())); - - if (!retain_graph) { - VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() - << " because of retain_graph=False when calling backward"; - var->GradVarBase()->SetGraphIsFreed(true); - var->GradVarBase()->ClearGradNode(); - } - if (init_node_ == nullptr || var->OverridedStopGradient()) { - VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " - "stop_gradient=True: " - << var->Name(); - return; - } + PADDLE_ENFORCE_EQ( + tensors.size(), grad_tensors.size(), + platform::errors::Unavailable( + "The size of tensors do not equal the size of grad_tensors," + "the size of tensors is %s, but the size of grad_tensors is %s.", + tensors.size(), grad_tensors.size())); + + for (size_t i = 0; i < tensors.size(); ++i) { + auto var = tensors[i]; + auto grad_tensor = grad_tensors[i]; + + auto init_node = var->GradVarBase()->GradNode(); + PADDLE_ENFORCE_EQ( + var->GradVarBase()->GraphIsFreed(), false, + platform::errors::Unavailable( + "%s trying to backward through the same graph a second " + "time, but this graph have already been freed. Please " + "specify Tensor.backward(retain_graph=True) when " + "calling backward at the first time.", + var->Name())); + + if (!retain_graph) { + VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() + << " because of retain_graph=False when calling backward"; + var->GradVarBase()->SetGraphIsFreed(true); + var->GradVarBase()->ClearGradNode(); + } - VLOG(3) << "Init first node of backward"; + if (init_node == nullptr || var->OverridedStopGradient()) { + VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " + "stop_gradient=True: " + << var->Name(); + continue; + } - PADDLE_ENFORCE_EQ( - var->HasGradVar(), true, - platform::errors::NotFound("Grad variable not exist for variable %s", - var->Name())); - - auto& fwd_var = var->Var().Get(); - auto* grad_var = - var->GradVarBase()->MutableVar()->GetMutable(); - VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() - << " as stop_gradient false"; - var->GradVarBase()->InnerSetOverridedStopGradient(false); - auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place()); - grad_var->Resize(fwd_var.dims()); - grad_var->mutable_data(fwd_var.place(), fwd_var.type()); - operators::math::set_constant(*dev_ctx, grad_var, 1.0); + VLOG(3) << "Init node of backward"; + + PADDLE_ENFORCE_EQ( + var->HasGradVar(), true, + platform::errors::NotFound("Tensor %s has no gradient", var->Name())); + + auto& fwd_var = var->Var().Get(); + auto* grad_var = + var->GradVarBase()->MutableVar()->GetMutable(); + VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() + << " as stop_gradient false"; + var->GradVarBase()->InnerSetOverridedStopGradient(false); + auto* dev_ctx = + platform::DeviceContextPool::Instance().Get(fwd_var.place()); + if (grad_tensor == nullptr) { + grad_var->Resize(fwd_var.dims()); + grad_var->mutable_data(fwd_var.place(), fwd_var.type()); + operators::math::set_constant(*dev_ctx, grad_var, 1.0); + } else { + paddle::framework::TensorCopy( + grad_tensor->Var().Get(), fwd_var.place(), + *dev_ctx, grad_var); + } + + init_nodes_.push_back(init_node); + } } void BasicEngine::CheckBackwardInputs(const OpBase& op) { @@ -224,8 +249,10 @@ void BasicEngine::PrepareDeps() { std::queue q; std::unordered_set visited; - q.push(init_node_.get()); - visited.insert(init_node_.get()); + for (size_t i = 0; i < init_nodes_.size(); ++i) { 
+ q.push(init_nodes_[i].get()); + visited.insert(init_nodes_[i].get()); + } while (!q.empty()) { auto* cur_node = q.front(); @@ -276,14 +303,16 @@ static std::shared_ptr> CallGradientHooks( } void BasicEngine::Execute() { - if (init_node_ == nullptr) { + if (init_nodes_.empty()) { return; } PrepareDeps(); // Start execute Computation graph std::queue> q; - q.push(std::move(init_node_)); + for (size_t i = 0; i < init_nodes_.size(); ++i) { + q.push(std::move(init_nodes_[i])); + } size_t op_num = 0; @@ -505,7 +534,7 @@ void BasicEngine::Execute() { } void BasicEngine::Clear() { - init_node_.reset(); + init_nodes_.clear(); node_deps_.clear(); accumulators_.clear(); accumulators_with_grad_node_.clear(); diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index a2ad8b5f8aa..49761a8df0b 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -30,7 +30,9 @@ class OpBase; class BasicEngine : public Engine { public: - void Init(VarBase* var, bool retain_graph = false); + void Init(const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph = false); void Execute() override; @@ -46,7 +48,7 @@ class BasicEngine : public Engine { void Clear(); private: - std::shared_ptr init_node_; + std::vector> init_nodes_; std::unordered_map node_deps_; // The input and output of Inplace op are the same. If only `var` is used // as the key, then the input and output of inplace op must be gradient diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 9b75fac0ca5..8c907b98906 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -92,8 +92,10 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); // 3. backward + std::vector> tensors{out}; + std::vector> grad_tensors{nullptr}; BasicEngine engine; - engine.Init(out.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); framework::LoDTensor x_grad; @@ -191,8 +193,10 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); // 3. 
backward + std::vector> tensors{out}; + std::vector> grad_tensors{nullptr}; BasicEngine engine; - engine.Init(out.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); framework::LoDTensor x_grad; diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 9e3b0ea5df6..76de413b3e6 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -250,7 +250,10 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) { tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map, gpu_place, true); imperative::BasicEngine engine; - engine.Init(reduce_sum_out.get()); + + std::vector> tensors{reduce_sum_out}; + std::vector> grad_tensors{nullptr}; + engine.Init(tensors, grad_tensors); engine.Execute(); framework::LoDTensor rlt; @@ -376,8 +379,10 @@ TEST(test_tracer, test_var_without_grad_var) { ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL); ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL); + std::vector> tensors{vout}; + std::vector> grad_tensors{nullptr}; imperative::BasicEngine engine; - engine.Init(vout.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); // check the grad diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index c1c1387a84c..4ab507fe367 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -805,6 +805,7 @@ void BindImperative(py::module *m_ptr) { Bump the version whenever the Tensor is modified through an inplace operation. )DOC") .def("numpy", + [](imperative::VarBase &self) -> py::array { const auto &tensor = self.MutableVar()->Get(); @@ -1003,18 +1004,6 @@ void BindImperative(py::module *m_ptr) { print(x.stop_gradient) # True print(x.grad) # None )DOC") - .def("_run_backward", - [](imperative::VarBase &self, const imperative::Tracer &tracer, - bool retain_graph) { - // TODO(jiabin): when we impl more backward execution we can - // select them - auto *engine = tracer.GetEngine(); - engine->Init(&self, retain_graph); - VLOG(3) << "Start backward"; - engine->Execute(); - VLOG(3) << "Finish backward"; - }, - py::call_guard()) .def("_grad_name", &imperative::VarBase::GradVarName) .def("_grad_value", [](imperative::VarBase &self) { @@ -1549,6 +1538,19 @@ void BindImperative(py::module *m_ptr) { }, py::call_guard()); + m.def( + "dygraph_run_backward", + [](const std::vector> &tensors, + const std::vector> &grad_tensors, + bool retain_graph, const imperative::Tracer &tracer) { + auto *engine = tracer.GetEngine(); + engine->Init(tensors, grad_tensors, retain_graph); + VLOG(3) << "Start backward"; + engine->Execute(); + VLOG(3) << "Finish backward"; + }, + py::call_guard()); + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) py::class_ 0, "{} connot be empyt".format(name) + for each_var in in_out_list: + assert isinstance( + each_var, paddle. + Tensor), "Elements of {} must be paddle.Tensor".format(name) + return in_out_list + else: + assert isinstance( + in_out_list, + paddle.Tensor), "{} must be Tensor or list of Tensor".format( + name) + return [in_out_list] + + tensors = check_tensors(tensors, "tensors") + + assert len(tensors) == len( + set(tensors) + ), "The argument 'tensors' of paddle.autograd.backward contains duplicate paddle.Tensor object." 
+
+    if grad_tensors is not None:
+        if not isinstance(grad_tensors, (list, tuple)):
+            grad_tensors = [grad_tensors]
+
+        for each_tensor in grad_tensors:
+            if each_tensor is not None:
+                assert isinstance(
+                    each_tensor, paddle.Tensor
+                ), "The argument 'grad_tensors' of paddle.autograd.backward is invalid, it can be 'None', 'paddle.Tensor' or 'list[None/paddle.Tensor]'."
+    else:
+        grad_tensors = [None] * len(tensors)
+
+    if len(grad_tensors) > 0:
+        assert len(tensors) == len(
+            grad_tensors), "The length of grad_tensors must be equal to the length of tensors"
+
+    assert isinstance(retain_graph, bool), "retain_graph must be True or False"
+
+    core.dygraph_run_backward(tensors, grad_tensors, retain_graph,
+                              framework._dygraph_tracer())
diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py
index 08d58e0c808..be5d9ac5831 100644
--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
@@ -26,6 +26,7 @@ import logging
 from ..data_feeder import convert_dtype
 import warnings
 from ..framework import _get_paddle_place
+import paddle
 
 __all__ = [
     'no_grad', 'no_grad_', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph',
diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index e565552632f..ac594709867 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -163,7 +163,7 @@ def monkey_patch_varbase():
                 framework._current_expected_place())
 
     @framework.dygraph_only
-    def backward(self, retain_graph=False):
+    def backward(self, grad_tensor=None, retain_graph=False):
         """
         Run backward of current Graph which starts from current Tensor.
@@ -172,17 +172,22 @@ def monkey_patch_varbase():
         You can clear gradient by ``Tensor.clear_grad()`` .
 
         Args:
+            grad_tensor(Tensor, optional): initial gradient values of the current Tensor. If `grad_tensor` is None,
+            the initial gradient values of the current Tensor would be a Tensor filled with 1.0;
+            if `grad_tensor` is not None, it must have the same shape as the current Tensor.
+            The default value is None.
+
             retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would
                 like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter
                 :code:`retain_graph` to True, then the grads will be retained. Thus, setting it to False is much more memory-efficient.
                 Defaults to False.
-
         Returns:
             NoneType: None
 
         Examples:
             .. code-block:: python
 
+                import paddle
                 x = paddle.to_tensor(5., stop_gradient=False)
                 for i in range(5):
                     y = paddle.pow(x, 4.0)
@@ -198,15 +203,36 @@ def monkey_patch_varbase():
                 print("{}".format(x.grad))
                 # 0.
 
+                grad_tensor=paddle.to_tensor(2.)
+                for i in range(5):
+                    y = paddle.pow(x, 4.0)
+                    y.backward(grad_tensor)
+                    print("{}: {}".format(i, x.grad))
+                # 0: [1000.]
+                # 1: [2000.]
+                # 2: [3000.]
+                # 3: [4000.]
+                # 4: [5000.]
+
         """
         if framework.in_dygraph_mode():
+            if grad_tensor is not None:
+                assert isinstance(
+                    grad_tensor, paddle.
+                    Tensor), "The type of grad_tensor must be paddle.Tensor"
+                assert grad_tensor.shape == self.shape, \
+                    "Tensor shape does not match: grad_tensor [ {} ] with shape {} does not match Tensor [ {} ] with shape {}".format(
+                    grad_tensor.name, grad_tensor.shape, self.name, self.shape)
+
             if paddle.is_compiled_with_xpu():
                 # TODO(liuyuhui): Currently only for xpu. Will be removed in the future.
scaled_loss = scale_loss(self) - scaled_loss._run_backward(framework._dygraph_tracer(), - retain_graph) + core.dygraph_run_backward([scaled_loss], [grad_tensor], + retain_graph, + framework._dygraph_tracer()) else: - self._run_backward(framework._dygraph_tracer(), retain_graph) + core.dygraph_run_backward([self], [grad_tensor], retain_graph, + framework._dygraph_tracer()) else: raise ValueError( "Variable.backward() is only available in DyGraph mode") diff --git a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py new file mode 100644 index 00000000000..a7472e7ffd7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py @@ -0,0 +1,119 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.fluid.dygraph as dg +from op_test import OpTest + + +class TestTensorBackward(unittest.TestCase): + def setUp(self): + self._dtypes = ["float32", "float64"] + self._places = [paddle.CPUPlace()] + if paddle.is_compiled_with_cuda(): + self._places.append(paddle.CUDAPlace(0)) + + def test_tensor_backward(self): + for dtype in self._dtypes: + x = np.random.random([2, 100]).astype(dtype) + y = np.random.random([100, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.random.random(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor = paddle.matmul(x_tensor, y_tensor) + + grad_tensor = paddle.to_tensor(grad) + z_tensor.backward(grad_tensor) + + x_grad = np.matmul(grad, y.T) + + self.assertTrue(np.allclose(x_grad, x_tensor.grad)) + + +class TestBackwardAPI(unittest.TestCase): + def setUp(self): + self._dtypes = ["float32", "float64"] + self._places = [paddle.CPUPlace()] + if paddle.is_compiled_with_cuda(): + self._places.append(paddle.CUDAPlace(0)) + + def test_backward_api(self): + for dtype in self._dtypes: + x = np.random.random([2, 2]).astype(dtype) + y = np.random.random([2, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.random.random(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor1 = paddle.matmul(x_tensor, y_tensor) + z_tensor2 = paddle.matmul(x_tensor, y_tensor) + + grad_tensor = paddle.to_tensor(grad) + paddle.autograd.backward([z_tensor1, z_tensor2], + [grad_tensor, grad_tensor], True) + + x_grad = np.matmul(grad, y.T) + + self.assertTrue(np.allclose(x_grad * 2, x_tensor.grad)) + + def test_backward_single_tensor(self): + for dtype in self._dtypes: + x = np.random.random([2, 2]).astype(dtype) + y = np.random.random([2, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.random.random(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = 
paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor1 = paddle.matmul(x_tensor, y_tensor) + + grad_tensor = paddle.to_tensor(grad) + paddle.autograd.backward(z_tensor1, grad_tensor, True) + + x_grad = np.matmul(grad, y.T) + + self.assertTrue(np.allclose(x_grad, x_tensor.grad)) + + def test_backward_none_grad_tensor(self): + for dtype in self._dtypes: + x = np.random.random([2, 2]).astype(dtype) + y = np.random.random([2, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.ones(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor1 = paddle.matmul(x_tensor, y_tensor) + + paddle.autograd.backward(z_tensor1, None) + + x_grad = np.matmul(grad, y.T) + + self.assertTrue(np.allclose(x_grad, x_tensor.grad)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index 73c773bab49..e4532b3e55d 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -216,6 +216,7 @@ packages=['paddle', 'paddle.static.amp', 'paddle.tensor', 'paddle.onnx', + 'paddle.autograd', ] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: -- GitLab
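Usage reference for the API introduced by this patch. This is a minimal sketch assembled from the docstring added to Tensor.backward and the cases in test_custom_grad_input.py; the tensor shapes and values below are illustrative only and are not part of the patch.

    import paddle

    x = paddle.to_tensor([[1., 2.], [3., 4.]], stop_gradient=False)
    y = paddle.to_tensor([[5., 6.], [7., 8.]])
    z1 = paddle.matmul(x, y)
    z2 = paddle.matmul(x, y)

    # Custom initial gradient; it must have the same shape as the tensor
    # whose backward pass it seeds (Tensor.backward asserts this).
    grad = paddle.ones_like(z1)

    # Single-tensor form patched onto Tensor in varbase_patch_methods.py:
    #   z1.backward(grad, retain_graph=True)
    # Multi-tensor form added in python/paddle/autograd/backward_mode.py;
    # gradients from z1 and z2 accumulate into x.grad, as checked in
    # TestBackwardAPI.
    paddle.autograd.backward([z1, z2], [grad, grad], True)

    print(x.grad)  # equals 2 * (grad @ y.T) for this example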