Unverified commit 7ba85aca authored by Chen Weihang, committed by GitHub

Add inner register backward hook method for Tensor (#32171)

* add register backward hook method

* add leaf grad accumulated test
Parent f3e49c40
......@@ -284,15 +284,15 @@ static std::shared_ptr<NameVarMap<VariableWrapper>> CallGradientHooks(
for (const auto& pair : bwd_ins) {
for (size_t i = 0; i < pair.second.size(); ++i) {
auto& var = pair.second[i];
if (var->HasHook()) {
if (var->HasVariableWrapperHook()) {
if (tmp_ins_ptr == nullptr) {
tmp_ins_ptr = std::make_shared<NameVarMap<VariableWrapper>>(bwd_ins);
}
VLOG(3) << "Call " << var->GetHooks().size() << " hooks of " << op_type
<< "'s input `" << pair.first << "`'s var `" << var->Name()
<< "`.";
VLOG(3) << "Call " << var->GetVariableWrapperHooks().size()
<< " hooks of " << op_type << "'s input `" << pair.first
<< "`'s var `" << var->Name() << "`.";
auto tmp_var = var;
for (const auto& hook_pair : var->GetHooks()) {
for (const auto& hook_pair : var->GetVariableWrapperHooks()) {
tmp_var = (*hook_pair.second)(tmp_var);
}
(*tmp_ins_ptr)[pair.first][i] = tmp_var;
......
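The loop above chains every registered VariableWrapper hook on a grad op's input: each hook receives the previous hook's output (`tmp_var = (*hook_pair.second)(tmp_var)`). A minimal Python sketch of the user-visible effect, assuming the public `register_hook` API is backed by these hooks and that they run in registration order (as the id-keyed std::map suggests):

import paddle

x = paddle.to_tensor([1., 2., 3.], stop_gradient=False)
y = x * x

# Two hooks on an interior var; each hook's return value feeds the next one.
y.register_hook(lambda grad: grad * 2)   # applied first
y.register_hook(lambda grad: grad + 1)   # then applied to the doubled grad

y.sum().backward()

# If the hooks chain in registration order, y's gradient [1., 1., 1.] becomes
# [3., 3., 3.] before it flows into the grad op of y = x * x, so x.grad is
# expected to be [3., 3., 3.] * 2x = [6., 12., 18.].
print(x.grad)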
......@@ -467,14 +467,14 @@ void GradientAccumulator::CallGradientHooks() {
platform::errors::PreconditionNotMet("Leaf Tensor's inner var "
"is not initialized when "
"call gradient hook."));
if (var_->HasHook()) {
VLOG(3) << "Call " << var_->GetHooks().size()
if (var_->HasVariableWrapperHook()) {
VLOG(3) << "Call " << var_->GetVariableWrapperHooks().size()
<< " hooks of leaf gradient accumulator's inner var `"
<< var_->Name() << "`.";
auto tmp_var = inner_var_;
VLOG(3) << "Input var " << var_->Name() << "'s hook size - "
<< var_->GetHooks().size();
for (const auto& hook_pair : var_->GetHooks()) {
<< var_->GetVariableWrapperHooks().size();
for (const auto& hook_pair : var_->GetVariableWrapperHooks()) {
tmp_var = (*hook_pair.second)(tmp_var);
}
inner_var_ = tmp_var;
......@@ -495,10 +495,10 @@ void GradientAccumulator::CallReduceHooks() {
"Only can call reduce hooks after the "
"gradient accumulation is completed in "
"current batch or across batchs."));
if (var_->HasMutableHook()) {
for (const auto& hook : var_->GetMutableHooks()) {
if (var_->HasVoidHook()) {
for (const auto& hook : var_->GetVoidHooks()) {
VLOG(3) << "call gradient accumulator backward hooks.";
(*hook)(var_);
(*hook)();
}
}
}
......
......@@ -23,32 +23,34 @@ namespace imperative {
class VariableWrapper;
/** [ Const VariableWrapper Hook: Pre hook functor of OpBase ]
/** [ VariableWrapper Hook ]
*
* @brief This hook functor is executed before the grad OpBase is executed,
* taking the input of the current grad OpBase as input, and
* executing python hooks (user-defined) or C++ hooks (developer-defined)
* to achieve the purpose of custom operations on the interior VarBase
* gradient.
* @brief This hook functor is executed before the grad OpBase is executed, or
*        after the gradient accumulation of the current batch has completed.
*        1. For an interior var, the VariableWrapper Hook takes the input of
*           the current grad OpBase as its input.
*        2. For a leaf var, the VariableWrapper Hook takes the inner_var_ of
*           the GradientAccumulator as its input.
*
* @note This hook functor will not change the input gradient VarBase.
* @note This hook functor does not change the input gradient VariableWrapper
*       in place. However, if you copy the input VariableWrapper and modify
*       the value of the Variable inside it, the input's value is changed as
*       well, because the two wrappers share the same PlaceHolder.
*
* @note [Why need to be OpBase `PreHook`, why not `PostHook`?]
* @note [ Why need to be OpBase `PreHook`, why not `PostHook`? ]
*
* 1. We expect If set OpBase post hook, when the op executed end, the
* If we set an OpBase post hook, then when the op finishes executing,
* its output gradient may not yet be in its final state, because other
* ops' gradient outputs may still need to be accumulated into it. In
* contrast, before an op is executed, the gradients it takes as input
* must already have been accumulated to their final values.
* 2. We don’t want the hook to change its input Tensor value, so now
* we can't call all hooks in GradAccumulator.
*
* @note [Why only can be used for interior VarBase?]
* @note [ Why Leaf gradient is special? ]
*
* Because a leaf VarBase's GradVarBase has no GradOpNode, the leaf
* GradVarBase has no next OpBase to execute, so if we need to deal with
* the leaf GradVarBase, cannot use this hook functor. For this case, we
* deal with by other inplace hook method.
* the leaf GradVarBase, we should call its hooks after its gradient
* accumulation has completed.
*/
class VariableWrapperHook {
public:
......@@ -57,34 +59,22 @@ class VariableWrapperHook {
const std::shared_ptr<VariableWrapper>& var) = 0;
};
/** [ Inplace VariableWrapper Hook: Post hook functor of GradAccumulator ]
*
* @brief This hook functor is the Hook that operates on the current
* gradient after the GradientAccumulator has accumulated the gradient.
* Leaf GradVarBase has no next OpBase, if we want to register hook
* for it, we also need to wait until the leaf GradVarBase accumulation
* is completed, so we can add post hook to GradientAccumulator.
*
* @note This hook functor will change the grad VarBase value.
*
* @note Only allow leaf VarBase hold call this hook functor.
*/
class InplaceVariableWrapperHook {
public:
virtual ~InplaceVariableWrapperHook() = default;
virtual void operator()(VariableWrapper* var) = 0;
};
class LambdaInplaceVariableWrapperHook : public InplaceVariableWrapperHook {
class CppVariableWrapperHook : public VariableWrapperHook {
public:
explicit LambdaInplaceVariableWrapperHook(
std::function<void(VariableWrapper*)>&& fn)
explicit CppVariableWrapperHook(
std::function<std::shared_ptr<VariableWrapper>(
const std::shared_ptr<VariableWrapper>&)>&& fn)
: fn_(std::move(fn)) {}
void operator()(VariableWrapper* var) override { fn_(var); }
std::shared_ptr<VariableWrapper> operator()(
const std::shared_ptr<VariableWrapper>& var) override {
return fn_(var);
}
private:
std::function<void(VariableWrapper*)> fn_;
std::function<std::shared_ptr<VariableWrapper>(
const std::shared_ptr<VariableWrapper>&)>
fn_;
};
} // namespace imperative
......
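The comment block above distinguishes hooks on interior vars (run as a pre-hook of the next grad OpBase) from hooks on leaf vars (run once the leaf's gradient accumulation has completed). A small Python sketch of the leaf case, with values borrowed from the `test_hook_for_accumulated_grad_leaf_var` unit test added later in this commit:

import numpy as np
import paddle

x = paddle.to_tensor([0., 1., 2., 4.], stop_gradient=False)
x.register_hook(lambda grad: grad * 2)   # hook on a leaf var

y = paddle.to_tensor([4., 5., 6., 7.], stop_gradient=False)
z = paddle.to_tensor([1., 2., 3., 4.], stop_gradient=False)

# x contributes to the output through two branches, so its gradient is
# accumulated from both; the hook runs once, on the accumulated result.
o = (x + y).matmul(x + z)
o.backward()

# Without the hook x.grad would be [5., 9., 13., 19.]; with the hook it is
# doubled exactly once, not once per branch.
print(np.array_equal(x.grad, np.array([5., 9., 13., 19.]) * 2))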
......@@ -226,23 +226,25 @@ class VarBase {
void BumpInplaceVersion();
/* Hook related method: now only used for GradVarBase */
bool HasHook() const { return var_->HasHook(); }
bool HasVariableWrapperHook() const { return var_->HasVariableWrapperHook(); }
int64_t AddHook(std::shared_ptr<VariableWrapperHook>&& hook) {
return var_->AddHook(
int64_t AddVariableWrapperHook(std::shared_ptr<VariableWrapperHook>&& hook) {
return var_->AddVariableWrapperHook(
std::forward<std::shared_ptr<VariableWrapperHook>>(hook));
}
bool RemoveHook(const int64_t& hook_id) { return var_->RemoveHook(hook_id); }
bool RemoveVariableWrapperHook(const int64_t& hook_id) {
return var_->RemoveVariableWrapperHook(hook_id);
}
const std::map<int64_t, std::shared_ptr<VariableWrapperHook>>& GetHooks()
const {
return var_->GetHooks();
const std::map<int64_t, std::shared_ptr<VariableWrapperHook>>&
GetVariableWrapperHooks() const {
return var_->GetVariableWrapperHooks();
}
void AddMutableHook(std::shared_ptr<InplaceVariableWrapperHook>&& hook) {
var_->AddMutableHook(
std::forward<std::shared_ptr<InplaceVariableWrapperHook>>(hook));
void AddVoidHook(std::shared_ptr<std::function<void()>>&& hook) {
var_->AddVoidHook(
std::forward<std::shared_ptr<std::function<void()>>>(hook));
}
private:
......
......@@ -310,9 +310,8 @@ Reducer::Reducer(const std::vector<std::shared_ptr<imperative::VarBase>> &vars,
for (size_t global_var_index = 0; global_var_index < vars_.size();
++global_var_index) {
auto var = vars_[global_var_index];
var->GradVarBase()->AddMutableHook(
std::make_shared<LambdaInplaceVariableWrapperHook>([=](
VariableWrapper *grad) { this->AddDistHook(global_var_index); }));
var->GradVarBase()->AddVoidHook(std::make_shared<std::function<void()>>(
[=]() { this->AddDistHook(global_var_index); }));
var_index_map_[var->GradVarBase()->SharedVar().get()] = global_var_index;
}
......
......@@ -37,6 +37,30 @@ namespace imperative {
using vb_vector = std::vector<std::shared_ptr<imperative::VarBase>>;
using var_pair = std::pair<std::string, vb_vector>;
std::shared_ptr<imperative::VariableWrapper> DoubleHook(
const std::shared_ptr<imperative::VariableWrapper>& var) {
// 1. create out var
auto out_var = std::make_shared<imperative::VariableWrapper>(var->Name());
out_var->SetType(var->Type());
out_var->SetDataType(var->DataType());
out_var->SetForwardDataType(var->ForwardDataType());
out_var->InnerSetOverridedStopGradient(var->InnerOverridedStopGradient());
// 2. get input and output var's tensor
auto* out_tensor = out_var->MutableVar()->GetMutable<framework::LoDTensor>();
auto& tensor = var->Var().Get<framework::LoDTensor>();
out_tensor->Resize(tensor.dims());
// 3. double calc
auto* data = tensor.data<float>();
auto* out_data = out_tensor->mutable_data<float>(platform::CPUPlace());
for (int64_t i = 0; i < out_tensor->numel(); ++i) {
out_data[i] = data[i] * 2.0;
}
return out_var;
}
TEST(TestHooks, TestGradVarLeafBackwardHook) {
// 1. prepare
Tracer tracer;
......@@ -73,16 +97,14 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) {
framework::AttributeMap mul_attr_map;
mul_attr_map["use_mkldnn"] = false;
// add GradAccumulatorPostHook
x->GradVarBase()->AddMutableHook(
std::make_shared<LambdaInplaceVariableWrapperHook>(
[=](VariableWrapper* grad) {
auto* grad_tensor =
grad->MutableVar()->GetMutable<framework::LoDTensor>();
for (int i = 0; i < grad_tensor->numel(); ++i) {
grad_tensor->mutable_data<float>(place)[i] *= 2.0;
}
}));
// add VariableWrapper hook
x->GradVarBase()->AddVariableWrapperHook(
std::make_shared<imperative::CppVariableWrapperHook>(DoubleHook));
// add Void hook
int64_t hook_value = 0;
x->GradVarBase()->AddVoidHook(
std::make_shared<std::function<void()>>([&]() { hook_value = 10; }));
// 2. forward
tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true);
......@@ -98,12 +120,15 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) {
engine.Init(tensors, grad_tensors);
engine.Execute();
// verify VariableWrapper hook result
framework::LoDTensor x_grad;
framework::TensorCopySync(x->GradVar().Get<framework::LoDTensor>(), place,
&x_grad);
for (int i = 0; i < x_grad.numel(); ++i) {
ASSERT_EQ(x_grad.data<float>()[i], 8.0);
}
// verify Void hook result
ASSERT_EQ(hook_value, 10);
framework::LoDTensor y_grad;
framework::TensorCopySync(y->GradVar().Get<framework::LoDTensor>(), place,
......@@ -152,16 +177,14 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() {
memory::Copy(place, mutable_z, place, src_data.data(),
sizeof(float) * src_data.size());
// add ReduceBackwardHook
x->GradVarBase()->AddMutableHook(
std::make_shared<LambdaInplaceVariableWrapperHook>(
[=](VariableWrapper* grad) {
auto* grad_tensor =
grad->MutableVar()->GetMutable<framework::LoDTensor>();
for (int i = 0; i < grad_tensor->numel(); ++i) {
grad_tensor->mutable_data<float>(place)[i] *= 2.0;
}
}));
// add VariableWrapper hook
x->GradVarBase()->AddVariableWrapperHook(
std::make_shared<imperative::CppVariableWrapperHook>(DoubleHook));
// add Void hook
int64_t hook_value = 0;
x->GradVarBase()->AddVoidHook(
std::make_shared<std::function<void()>>([&]() { hook_value = 100; }));
// 2. forward
var_pair x_pair = var_pair("X", vb_vector(1, x));
......@@ -199,12 +222,15 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() {
engine.Init(tensors, grad_tensors);
engine.Execute();
// verify VariableWrapper hook result
framework::LoDTensor x_grad;
framework::TensorCopySync(x->GradVar().Get<framework::LoDTensor>(), place,
&x_grad);
for (int i = 0; i < x_grad.numel(); ++i) {
ASSERT_EQ(x_grad.data<float>()[i], 16.0);
}
// verify Void hook result
ASSERT_EQ(hook_value, 100);
framework::LoDTensor y_grad;
framework::TensorCopySync(y->GradVar().Get<framework::LoDTensor>(), place,
......
......@@ -220,35 +220,35 @@ class VariableWrapper {
}
/* Hook related methods */
bool HasHook() const { return !hooks_.empty(); }
bool HasVariableWrapperHook() const { return !var_hooks_.empty(); }
bool HasMutableHook() const { return !mutable_hooks_.empty(); }
int64_t AddHook(std::shared_ptr<VariableWrapperHook>&& hook) {
hooks_.emplace(next_hook_id_, std::move(hook));
int64_t AddVariableWrapperHook(std::shared_ptr<VariableWrapperHook>&& hook) {
var_hooks_.emplace(next_hook_id_, std::move(hook));
return next_hook_id_++;
}
bool RemoveHook(const int64_t& hook_id) {
auto remove_cnt = hooks_.erase(hook_id);
bool RemoveVariableWrapperHook(const int64_t& hook_id) {
auto remove_cnt = var_hooks_.erase(hook_id);
if (remove_cnt == 0) {
return false;
}
return true;
}
const std::map<int64_t, std::shared_ptr<VariableWrapperHook>>& GetHooks()
const {
return hooks_;
const std::map<int64_t, std::shared_ptr<VariableWrapperHook>>&
GetVariableWrapperHooks() const {
return var_hooks_;
}
void AddMutableHook(std::shared_ptr<InplaceVariableWrapperHook>&& hook) {
mutable_hooks_.emplace_back(std::move(hook));
bool HasVoidHook() const { return !void_hooks_.empty(); }
void AddVoidHook(std::shared_ptr<std::function<void()>>&& hook) {
void_hooks_.emplace_back(std::move(hook));
}
const std::vector<std::shared_ptr<InplaceVariableWrapperHook>>&
GetMutableHooks() const {
return mutable_hooks_;
const std::vector<std::shared_ptr<std::function<void()>>>& GetVoidHooks()
const {
return void_hooks_;
}
private:
......@@ -319,14 +319,19 @@ class VariableWrapper {
// isn't need
bool is_empty_{false};
// NOTE(chenweihang): only grad var can hold hooks now
// NOTE(chenweihang): only grad var will hold hooks now
int64_t next_hook_id_{0};
// Hooks used to register hook for grad var, support adding and removing,
// [ Hooks with VariableWrapper as input and output ]
// NOTE: Now registered for grad var, support adding and removing,
// key is the accumulated int64_t value
std::map<int64_t, std::shared_ptr<VariableWrapperHook>> hooks_;
// Hooks executed after the execution of the entire backward process is over,
// currently only supported for reducing in distributed training
std::vector<std::shared_ptr<InplaceVariableWrapperHook>> mutable_hooks_;
// NOTE: Var hooks need to support removal, so each hook needs an id
std::map<int64_t, std::shared_ptr<VariableWrapperHook>> var_hooks_;
// [ Hooks without input and output ]
// NOTE: These hooks run after the entire backward process has
// finished; currently they are only used by the reducer in
// distributed training
// NOTE: Removing void hooks does not need to be supported for now
std::vector<std::shared_ptr<std::function<void()>>> void_hooks_;
};
} // namespace imperative
......
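As the notes above describe, VariableWrapper hooks live in an id-keyed map so they can be removed, while void hooks live in a plain vector with no removal support. A hedged Python sketch of what that means at the API level, treating `_register_backward_hook` as the private method added by this commit:

import paddle

x = paddle.to_tensor(5., stop_gradient=False)

# VariableWrapper hook: register_hook returns a helper whose remove() call
# erases the hook from the id-keyed map.
helper = x.register_hook(lambda grad: grad * 2)
helper.remove()

# Void hook: stored in a plain vector; there is currently no way to remove it.
x._register_backward_hook(lambda: print("backward finished for x"))

y = paddle.pow(x, 4.0)
y.backward()   # the message is printed; x.grad is not doubled (hook removed)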
......@@ -1069,20 +1069,58 @@ void BindImperative(py::module *m_ptr) {
.def("_register_grad_hook",
[](imperative::VarBase &self, const py::handle &hook) {
PADDLE_ENFORCE_EQ(
self.HasGradVar(), true,
!self.OverridedStopGradient() && self.HasGradVar(), true,
platform::errors::InvalidArgument(
"Cannot register hook on a tensor without gradient."));
return self.GradVarBase()->AddHook(
"Cannot register gradient hook on a Tensor that stop "
"gradient or without gradient."));
return self.GradVarBase()->AddVariableWrapperHook(
std::make_shared<PyVariableWrapperHook>(hook.ptr()));
})
.def("_remove_grad_hook",
[](imperative::VarBase &self, int64_t hook_id) {
PADDLE_ENFORCE_EQ(
self.HasGradVar(), true,
!self.OverridedStopGradient() && self.HasGradVar(), true,
platform::errors::InvalidArgument(
"Cannot remove hook on a tensor without gradient."));
return self.GradVarBase()->RemoveHook(hook_id);
"Cannot remove gradient hook on a Tensor that stop "
"gradient or without gradient."));
return self.GradVarBase()->RemoveVariableWrapperHook(hook_id);
})
.def("_register_backward_hook",
[](imperative::VarBase &self, const py::handle &hook) {
PADDLE_ENFORCE_EQ(
self.IsLeaf(), true,
platform::errors::InvalidArgument(
"Only can register backward hook for leaf Tensor."));
PADDLE_ENFORCE_EQ(
!self.OverridedStopGradient() && self.HasGradVar(), true,
platform::errors::InvalidArgument(
"Cannot register backward hook on a Tensor that stop "
"gradient or without gradient."));
auto py_func = PyObjectCast<std::function<void()>>(hook.ptr());
self.GradVarBase()->AddVoidHook(
std::make_shared<std::function<void()>>(py_func));
},
R"DOC(
Registers a backward hook for the current Tensor.

This hook will be called every time the gradient of the current Tensor has been fully calculated.

There are two differences from `_register_grad_hook`:
1. This backward hook is executed after the gradient accumulation has completed across batches,
   while the hook registered by `_register_grad_hook` is executed after the gradient accumulation
   has completed in the current batch.
2. This backward hook function should have the following signature:

       hook() -> None

   It takes no input and returns no value.

Args:
    hook(function): A backward hook to be registered for the Tensor's gradient.

Returns:
    None
)DOC")
.def("cpu",
[](const std::shared_ptr<imperative::VarBase> &self) {
if (platform::is_cpu_place(self->Place())) {
......@@ -1301,28 +1339,22 @@ void BindImperative(py::module *m_ptr) {
&imperative::VarBase::SetOverridedStopGradient)
.def_property("persistable", &imperative::VarBase::Persistable,
&imperative::VarBase::SetPersistable)
.def_property_readonly("shape",
[](imperative::VarBase &self) {
if (self.Var().IsType<framework::LoDTensor>()) {
return framework::vectorize<int>(
self.Var()
.Get<framework::LoDTensor>()
.dims());
} else if (self.Var()
.IsType<
framework::SelectedRows>()) {
return framework::vectorize<int>(
self.Var()
.Get<framework::SelectedRows>()
.value()
.dims());
} else {
VLOG(2) << "It is meaningless to get shape of "
"variable type "
<< GetTypeName(self);
return std::vector<int>();
}
})
.def_property_readonly(
"shape",
[](imperative::VarBase &self) {
if (self.Var().IsType<framework::LoDTensor>()) {
return framework::vectorize<int>(
self.Var().Get<framework::LoDTensor>().dims());
} else if (self.Var().IsType<framework::SelectedRows>()) {
return framework::vectorize<int>(
self.Var().Get<framework::SelectedRows>().value().dims());
} else {
VLOG(2) << "It is meaningless to get shape of "
"variable type "
<< GetTypeName(self);
return std::vector<int>();
}
})
.def_property_readonly("is_leaf", &imperative::VarBase::IsLeaf,
R"DOC(
Whether a Tensor is a leaf Tensor.
......
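A short Python sketch of the `_register_backward_hook` behaviour documented above, mirroring the `TestTensorRegisterBackwardHook` unit test added at the end of this commit: the void hook takes no arguments, returns nothing, and fires once per backward pass, after the leaf gradient of `x` has been fully accumulated.

import paddle

counter = {"calls": 0}

def backward_done():
    counter["calls"] += 1

x = paddle.to_tensor(5., stop_gradient=False)
x._register_backward_hook(backward_done)

for _ in range(5):
    y = paddle.pow(x, 4.0)
    y.backward()

# One call per completed backward pass over x's accumulated gradient.
print(counter["calls"])   # expected: 5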
......@@ -178,8 +178,9 @@ class TestTensorRegisterHook(unittest.TestCase):
# register hook and removed
run_double_hook_for_leaf_var(lambda grad: grad * 2, removed=True)
def test_hook_for_accumulated_grad(self):
def run_double_hook_for_accumulated_grad(double_hook, removed=False):
def test_hook_for_accumulated_grad_interior_var(self):
def run_double_hook_for_accumulated_grad_interior_var(double_hook,
removed=False):
for device in self.devices:
paddle.set_device(device)
......@@ -227,9 +228,50 @@ class TestTensorRegisterHook(unittest.TestCase):
if not removed else base_grad))
# register hook
run_double_hook_for_accumulated_grad(lambda grad: grad * 2)
run_double_hook_for_accumulated_grad_interior_var(lambda grad: grad * 2)
# register hook and removed
run_double_hook_for_accumulated_grad(
run_double_hook_for_accumulated_grad_interior_var(
lambda grad: grad * 2, removed=True)
def test_hook_for_accumulated_grad_leaf_var(self):
def run_double_hook_for_accumulated_grad_leaf_var(double_hook,
removed=False):
for device in self.devices:
paddle.set_device(device)
x = paddle.to_tensor([0., 1., 2., 4.])
x.stop_gradient = False
helper = x.register_hook(double_hook)
y = paddle.to_tensor([4., 5., 6., 7.])
z = paddle.to_tensor([1., 2., 3., 4.])
y.stop_gradient = False
z.stop_gradient = False
o1 = x + y
o2 = x + z
o1.stop_gradient = False
o2.stop_gradient = False
o = o1.matmul(o2)
# remove hook before backward
if removed:
helper.remove()
o.backward()
base_grad = np.array([5., 9., 13., 19.])
# x.grad is changed by x.hook
self.assertTrue(
np.array_equal(x.grad, base_grad * 2
if not removed else base_grad))
# register hook
run_double_hook_for_accumulated_grad_leaf_var(lambda grad: grad * 2)
# register hook and removed
run_double_hook_for_accumulated_grad_leaf_var(
lambda grad: grad * 2, removed=True)
def test_hook_in_model(self):
......@@ -409,5 +451,54 @@ class TestTensorRegisterHook(unittest.TestCase):
x.register_hook(lambda grad: grad * 2)
HOOK_INIT_VALUE = 10
HOOK_IS_CALLED = False
def global_void_hook():
global HOOK_INIT_VALUE
global HOOK_IS_CALLED
HOOK_INIT_VALUE *= 2
HOOK_IS_CALLED = True
class TestTensorRegisterBackwardHook(unittest.TestCase):
def setUp(self):
self.devices = ["cpu"]
if paddle.is_compiled_with_cuda():
self.devices.append("gpu")
def test_register_backward_hook(self):
global HOOK_INIT_VALUE
global HOOK_IS_CALLED
for device in self.devices:
x = paddle.to_tensor(5., stop_gradient=False)
x._register_backward_hook(global_void_hook)
for i in range(5):
y = paddle.pow(x, 4.0)
y.backward()
self.assertEqual(HOOK_INIT_VALUE, 320)
self.assertTrue(HOOK_IS_CALLED)
# reset initial value
HOOK_INIT_VALUE = 10
HOOK_IS_CALLED = False
def test_register_backward_hook_for_interior_var(self):
x = paddle.to_tensor(5., stop_gradient=False)
y = paddle.pow(x, 4.0)
with self.assertRaises(ValueError):
y._register_backward_hook(global_void_hook)
def test_register_backward_hook_for_var_without_gradient(self):
x = paddle.to_tensor(5.)
y = paddle.pow(x, 4.0)
with self.assertRaises(ValueError):
x._register_backward_hook(global_void_hook)
if __name__ == '__main__':
unittest.main()