Unverified commit f81569e3, authored by: Jiabin Yang, committed by: GitHub

Support auto prune logic in eager mode (#38960)

* support test_auto_prune_partial

* support rest of autoprune strategy in eager mode
Parent 3115d005
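For context, the auto-prune rule being ported to eager mode is: once a tensor in the forward graph is marked stop_gradient=True, backward() does not propagate through that branch, so the branch's parameters end up with no gradient. Below is a minimal sketch in the style of the func_auto_prune* tests further down in this diff; the Linear sizes and the eager-guard wrapper are illustrative assumptions rather than lines from the commit, and the FLAGS_sort_sum_gradient flag used by the original test is omitted since, per the TODO added in this diff, it is not needed in eager mode.

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():                 # exercise the new eager-mode code path
    with fluid.dygraph.guard():
        value0 = np.arange(26).reshape(2, 13).astype("float32")
        value1 = np.arange(6).reshape(2, 3).astype("float32")
        value2 = np.arange(10).reshape(2, 5).astype("float32")
        linear = fluid.dygraph.Linear(13, 5, dtype="float32")    # assumed layer sizes
        linear2 = fluid.dygraph.Linear(3, 3, dtype="float32")    # assumed layer sizes
        a = fluid.dygraph.to_variable(value0)
        b = fluid.dygraph.to_variable(value1)
        c = fluid.dygraph.to_variable(value2)
        out1 = linear(a)
        out2 = linear2(b)
        out1.stop_gradient = True                  # prune the out1 branch
        out = fluid.layers.concat(input=[out1, out2, c], axis=1)
        out.backward()
        assert linear.weight.gradient() is None    # pruned: no grad reached linear
        assert out1.gradient() is None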
......@@ -1852,7 +1852,7 @@ static std::string GenerateGradNodeCCContents(
" %s\n"
" return outputs;\n";
generated_grad_function_body = paddle::string::Sprintf(
-        BWD_RETURN_TEMPLATE, outs_size, generated_grad_function_body);
BWD_RETURN_TEMPLATE, in_vars.size(), generated_grad_function_body);
// [Generation] Get Full Grad Function
const char* GRAD_FUNCTION_TEMPLATE =
......
......@@ -103,7 +103,17 @@ void RunBackward(const std::vector<egr::EagerTensor>& tensors,
VLOG(2) << "Out Rank of Tensor is slot: " << input_info.first
<< ", rank: " << input_info.second;
// Get target GradNodeBase from target tensors
-  GradNodeBase* grad_node = auto_grad_meta->GetMutableGradNode().get();
auto shared_grad_node = auto_grad_meta->GetMutableGradNode();
if (shared_grad_node == nullptr || shared_grad_node.get() == nullptr ||
auto_grad_meta->StopGradient()) {
VLOG(3) << "Skip auto grad since there is no grad op for var or loss is "
"stop_gradient=True: "
<< tensor.name();
continue;
}
GradNodeBase* grad_node = shared_grad_node.get();
// Prepare GradTensorHolder
if (!node_input_buffers_dict.count(grad_node)) {
......@@ -192,19 +202,38 @@ void RunBackward(const std::vector<egr::EagerTensor>& tensors,
// Since we make edge has as same rank as bwd outputs, we indexing them
// with
// the same rank(i, j)
VLOG(6) << "Get Edge with slot: " << i << ", rank: " << j;
egr::EagerTensor& grad_output_tensor = grad_output_tensors[i][j];
if (!grad_output_tensor.defined() ||
!grad_output_tensor.initialized()) {
VLOG(6) << "We get grad_output_tensor with slot: " << i
<< ", rank: " << j << " as uninitialized or undefined tensor";
}
GradNodeBase* next_node = edge.GetMutableGradNode().get();
auto next_node_shared = edge.GetMutableGradNode();
// Next node could be nullptr if it is leaf tensor with no
// AccumulationNode attached
// Or it could also originated from dispensable inputs
-      if (!next_node) continue;
if (!next_node_shared || !next_node_shared.get() ||
grad_output_tensors[i].empty()) {
continue;
}
PADDLE_ENFORCE_LT(
j, grad_output_tensors[i].size(),
paddle::platform::errors::Fatal(
"Rank of grad_output_tensors should be less than "
"grad_output_tensors[i].size(), which is: %d. This error may "
"indicate autoprune or autograd api error. ",
grad_output_tensors.size()));
egr::EagerTensor& grad_output_tensor = grad_output_tensors[i][j];
if ((!grad_output_tensor.defined() ||
!grad_output_tensor.initialized())) {
if (!grad_output_tensor.Var().IsInitialized()) {
VLOG(6)
<< "We get grad_output_tensor with slot: " << i
<< ", rank: " << j
<< " as uninitialized or undefined in both tensor and variable";
}
}
VLOG(6) << "Get Edge and grad_output_tensor with slot: " << i
<< ", rank: " << j
<< " 's name is: " << grad_output_tensor.name();
auto* next_node = next_node_shared.get();
if (!node_input_buffers_dict.count(next_node)) {
node_input_buffers_dict[next_node] =
......
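The null-edge check added above is what lets backward() walk past branches that never got a grad node, for example outputs of ops with no grad op maker. This is a condensed version of func_case4_with_no_grad_op_maker from the test changes further down, run under the eager guard just as the new test wrapper does:

import paddle.fluid as fluid
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    with fluid.dygraph.guard():
        out = fluid.layers.gaussian_random(shape=[20, 30])   # gaussian_random has no grad op maker
        loss = fluid.layers.mean(out)
        loss.backward()
        # mean_grad's edge toward `out` carries no grad node, so the edge is
        # skipped instead of dereferencing a null pointer, and `out` keeps no grad.
        assert out._grad_ivar() is None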
......@@ -164,6 +164,14 @@ class EagerTensor final {
*/
void reset() { tensor_->reset(); }
/**
* @brief Determine whether tensor is DenseTensor
*
* @return true
* @return false
*/
bool is_dense_tensor() const { return tensor_->is_dense_tensor(); }
/**
* @brief Transfer the current Tensor to the specified device and return.
*
......
......@@ -56,6 +56,7 @@ TEST(Backward, SingleNodeEmptyGrad) {
auto_grad_meta->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta->SetStopGradient(false);
// Connect Tensor and AccumulationNode via AutoGradMeta
auto acc_node_ptr = std::make_shared<egr::GradNodeAccumulation>();
......@@ -119,7 +120,7 @@ TEST(Backward, SingleNodeCustomGrad) {
auto_grad_meta->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta->SetStopGradient(false);
// Connect Tensor and AccumulationNode via AutoGradMeta
auto acc_node_ptr = std::make_shared<egr::GradNodeAccumulation>();
......@@ -189,7 +190,7 @@ TEST(Backward, LinearNodes) {
auto_grad_meta->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta->SetStopGradient(false);
// Connect Node0 -> Node1 via Edge
auto meta0 = egr::AutogradMeta();
meta0.SetStopGradient(false);
......@@ -281,13 +282,14 @@ TEST(Backward, WithAccumulation) {
auto_grad_meta0->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
auto_grad_meta0->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta0->SetStopGradient(false);
// Connect Inp1 and Node1 via AutoGradMeta
AutogradMeta* auto_grad_meta1 =
EagerUtils::autograd_meta(&(target_tensors[1]));
auto_grad_meta1->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(node1_ptr));
auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta1->SetStopGradient(false);
// Connect Node0 -> Node2 via Edge
auto meta0 = egr::AutogradMeta();
......
......@@ -58,6 +58,7 @@ TEST(CrossBatchAccumulation, SingleScaleNode) {
auto_grad_meta->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(scale_node_ptr));
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta->SetStopGradient(false);
egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0
auto meta = AutogradMeta();
......
......@@ -93,6 +93,7 @@ TEST(RetainGrad, HookBeforeRetainGrad) {
auto_grad_meta->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(scale_node_ptr));
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta->SetStopGradient(false);
target_tensor.set_autograd_meta(
std::dynamic_pointer_cast<paddle::experimental::AbstractAutogradMeta>(
auto_grad_meta));
......@@ -171,6 +172,7 @@ TEST(RetainGrad, HookAfterRetainGrad) {
auto_grad_meta->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(scale_node_ptr));
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta->SetStopGradient(false);
target_tensor.set_autograd_meta(
std::dynamic_pointer_cast<paddle::experimental::AbstractAutogradMeta>(
auto_grad_meta));
......
......@@ -99,7 +99,12 @@ std::pair<size_t, size_t> EagerUtils::OutRankInfo(
std::shared_ptr<GradNodeBase> EagerUtils::grad_node(
const egr::EagerTensor& target) {
-  return unsafe_autograd_meta(target)->GetMutableGradNode();
auto* meta = nullable_autograd_meta(target);
if (meta) {
return meta->GetMutableGradNode();
} else {
return nullptr;
}
}
void EagerUtils::SetHistory(std::vector<AutogradMeta*>* autograd_metas,
......
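With grad_node() now tolerating tensors that never acquired autograd meta, asking a freshly constructed eager tensor about its gradient degrades to None instead of tripping the old unsafe_autograd_meta assert. A small sketch of that behavior, assuming the EagerTensor(value=...) constructor and the gradient()/_grad_ivar() accessors exercised by the tests in this diff:

import numpy as np
from paddle.fluid import core
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    t = core.eager.EagerTensor(value=np.ones([4, 16]).astype("float32"))
    # No backward pass has run, so no grad node or grad value exists yet;
    # both accessors now report None rather than asserting.
    assert t.gradient() is None
    assert t._grad_ivar() is None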
......@@ -298,6 +298,21 @@ static PyObject* eager_tensor_method_detach(EagerTensorObject* self,
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* eager_tensor_method_get_underline_tensor(
EagerTensorObject* self, PyObject* args, PyObject* kwargs) {
EAGER_SYNC_TRY
if (self->eager_tensor.is_dense_tensor()) {
auto* tensor = static_cast<paddle::framework::LoDTensor*>(
self->eager_tensor.impl().get());
VLOG(6) << "tensor: " << tensor->IsInitialized();
return ToPyObject(tensor);
} else {
Py_IncRef(Py_None);
return Py_None;
}
EAGER_CATCH_AND_THROW_RETURN_NULL
}
PyMethodDef variable_methods[] = {
{"numpy", (PyCFunction)(void (*)(void))eager_tensor_method_numpy,
METH_VARARGS | METH_KEYWORDS, NULL},
......@@ -315,14 +330,17 @@ PyMethodDef variable_methods[] = {
METH_VARARGS | METH_KEYWORDS, NULL},
{"_zero_grads", (PyCFunction)(void (*)(void))eager_tensor__zero_grads,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_is_shared_buffer_to",
{"_share_buffer_to",
(PyCFunction)(void (*)(void))eager_tensor__share_buffer_to,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_share_buffer_with",
{"_is_shared_buffer_with",
(PyCFunction)(void (*)(void))eager_tensor__is_shared_buffer_with,
METH_VARARGS | METH_KEYWORDS, NULL},
{"detach", (PyCFunction)(void (*)(void))eager_tensor_method_detach,
METH_VARARGS | METH_KEYWORDS, NULL},
{"get_tensor",
(PyCFunction)(void (*)(void))eager_tensor_method_get_underline_tensor,
METH_VARARGS | METH_KEYWORDS, NULL},
{NULL, NULL, 0, NULL}};
} // namespace pybind
......
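The new get_tensor binding exposes the underlying LoDTensor of a dense EagerTensor (returned as a copy via pybind11), which is what test_value further down relies on. A minimal sketch, assuming the _is_initialized/_dtype/_place helpers that the test also calls:

import numpy as np
from paddle.fluid import core
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    arr = np.random.rand(4, 16, 16, 32).astype("float64")
    t = core.eager.EagerTensor(value=arr)
    dense = t.get_tensor()            # underlying framework::LoDTensor (a copy)
    assert dense._is_initialized()
    # _dtype() / _place() report the metadata of the copied LoDTensor,
    # mirroring the assertions in test_value.
    print(dense._dtype(), dense._place())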
......@@ -42,6 +42,18 @@ PyObject* eager_tensor_properties_get_name(EagerTensorObject* self,
EAGER_CATCH_AND_THROW_RETURN_NULL
}
PyObject* eager_tensor_properties_get_type(EagerTensorObject* self,
void* closure) {
EAGER_SYNC_TRY
if (self->eager_tensor.is_dense_tensor()) {
return ToPyObject(paddle::framework::proto::VarType::LOD_TENSOR);
} else {
Py_INCREF(Py_None);
return Py_None;
}
EAGER_CATCH_AND_THROW_RETURN_NULL
}
int eager_tensor_properties_set_name(EagerTensorObject* self, PyObject* value,
void* closure) {
EAGER_SYNC_TRY
......@@ -74,8 +86,13 @@ PyObject* eager_tensor_properties_get_grad(EagerTensorObject* self,
return ToPyObject(*accumulation_grad_node->Grad());
} else {
VLOG(6) << "Get grad for tensor: " << self->eager_tensor.name();
-    auto meta = egr::EagerUtils::unsafe_autograd_meta(self->eager_tensor);
-    return ToPyObject(meta->Grad());
auto meta = egr::EagerUtils::nullable_autograd_meta(self->eager_tensor);
if (meta) {
return ToPyObject(meta->Grad());
} else {
Py_INCREF(Py_None);
return Py_None;
}
}
EAGER_CATCH_AND_THROW_RETURN_NULL
}
......@@ -185,6 +202,8 @@ struct PyGetSetDef variable_properties[] = {
nullptr, nullptr},
{"dtype", (getter)eager_tensor_properties_get_dtype, nullptr, nullptr,
nullptr},
{"type", (getter)eager_tensor_properties_get_type, nullptr, nullptr,
nullptr},
{nullptr, nullptr, nullptr, nullptr, nullptr}};
} // namespace pybind
......
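On the Python side, the new getter makes eager tensors report a type like the old VarBase did: LOD_TENSOR for dense impls, falling through to None otherwise. A short sketch using the same APIs the updated tests call:

import numpy as np
import paddle
from paddle.fluid import core
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    t = paddle.to_tensor(np.ones([2, 2]).astype("float32"))
    # Dense eager tensors report LOD_TENSOR, matching the old VarBase.type.
    assert t.type == core.VarDesc.VarType.LOD_TENSOR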
......@@ -450,6 +450,18 @@ PyObject* ToPyObject(const paddle::framework::proto::VarType::Type& dtype) {
return obj.ptr();
}
PyObject* ToPyObject(const paddle::framework::proto::VarType& type) {
auto obj = ::pybind11::cast(type);
obj.inc_ref();
return obj.ptr();
}
PyObject* ToPyObject(const paddle::framework::LoDTensor* value) {
auto obj = ::pybind11::cast(value, py::return_value_policy::copy);
obj.inc_ref();
return obj.ptr();
}
PyObject* ToPyObject(const void* value) {
if (value == nullptr) {
Py_INCREF(Py_None);
......
......@@ -11,6 +11,7 @@ limitations under the License. */
#pragma once
#include <Python.h>
#include "paddle/pten/core/dense_tensor.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
......@@ -54,7 +55,9 @@ PyObject* ToPyObject(const std::vector<float>& value);
PyObject* ToPyObject(const std::vector<double>& value);
PyObject* ToPyObject(const std::vector<egr::EagerTensor>& value);
PyObject* ToPyObject(const platform::Place& value);
PyObject* ToPyObject(const framework::LoDTensor* value);
PyObject* ToPyObject(const paddle::framework::proto::VarType::Type& dtype);
PyObject* ToPyObject(const paddle::framework::proto::VarType& type);
PyObject* ToPyObject(const void* value);
PyObject* ToPyObject(
const std::unordered_map<std::string, std::vector<std::string>>& value);
......
......@@ -285,6 +285,9 @@ const paddle::platform::Place& DenseTensor::place() const {
storage_,
paddle::platform::errors::PreconditionNotMet(
"Tensor not initialized yet when Tensor::place() is called."));
if (storage_->data_shared()) {
return storage_->data_shared()->place();
}
return storage_->place();
}
......
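DenseTensor::place() now prefers the place recorded on the shared allocation, which matters once two eager tensors alias one buffer via the renamed _share_buffer_to. A sketch of the user-visible effect, in the style of test_share_buffer_to further down in this diff; the place assertion illustrates the intent of this change and is not a line taken from the tests:

import numpy as np
import paddle
from paddle.fluid import core
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    arr = np.ones([4, 16]).astype("float32")
    src = paddle.to_tensor(arr, core.VarDesc.VarType.FP32, core.CPUPlace())
    dst = paddle.to_tensor(np.zeros([4, 16]).astype("float32"),
                           core.VarDesc.VarType.FP32, core.CPUPlace())
    src._share_buffer_to(dst)                 # dst now aliases src's allocation
    assert np.array_equal(dst.numpy(), arr)
    assert dst._is_shared_buffer_with(src)
    # place() resolves through the shared allocation, so dst reports the
    # place of the buffer it shares with src.
    assert dst.place.is_cpu_place()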
......@@ -758,10 +758,10 @@ def monkey_patch_varbase():
@framework.dygraph_only
def _grad_ivar(self):
-        if self.grad._is_initialized():
-            return self.grad
-        else:
-            return None
if self.grad is not None:
if self.grad._is_initialized():
return self.grad
return None
@framework.dygraph_only
def _set_grad_ivar(self, value):
......@@ -782,6 +782,10 @@ def monkey_patch_varbase():
def clone(self):
return _C_ops_.assign(self)
@framework.dygraph_only
def value(self):
return self
if core._in_eager_mode() and not hasattr(core, "eager"):
return
......@@ -805,6 +809,7 @@ def monkey_patch_varbase():
setattr(core.eager.EagerTensor, "_set_grad_ivar", _set_grad_ivar)
setattr(core.eager.EagerTensor, "clear_gradient", clear_gradient)
setattr(core.eager.EagerTensor, "clone", clone)
setattr(core.eager.EagerTensor, "value", value)
else:
setattr(core.VarBase, "__name__", "Tensor")
setattr(core.VarBase, "grad", grad)
......
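The value() patch is a thin compatibility shim for eager mode: legacy code written against VarBase.value().get_tensor() keeps working because value() simply returns the eager tensor itself, and the get_tensor binding added earlier in this diff then hands back the underlying LoDTensor. A minimal sketch; the identity check is an assumption that follows directly from `return self`:

import numpy as np
import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    t = paddle.to_tensor(np.ones([2, 2]).astype("float32"))
    assert t.value() is t                            # value() just returns the tensor
    assert t.value().get_tensor()._is_initialized()  # legacy idiom still works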
......@@ -109,7 +109,7 @@ class EagerDtypeTestCase(unittest.TestCase):
core.VarDesc.VarType.COMPLEX128)
-class EagerTensorPropertiesTestCase(unittest.TestCase):
class EagerTensorPropertiesAndMethodsTestCase(unittest.TestCase):
def constructor(self, place):
egr_tensor = core.eager.EagerTensor()
self.assertEqual(egr_tensor.persistable, False)
......@@ -645,7 +645,8 @@ class EagerTensorPropertiesTestCase(unittest.TestCase):
self.assertTrue(tensor3.stop_gradient, True)
self.assertTrue(tensor3.place.is_cpu_place())
-    def test_share_buffer_to():
def test_share_buffer_to(self):
with _test_eager_guard():
arr = np.ones([4, 16, 16, 32]).astype('float32')
arr1 = np.zeros([4, 16]).astype('float32')
arr2 = np.ones([4, 16, 16, 32]).astype('float32') + np.ones(
......@@ -661,7 +662,7 @@ class EagerTensorPropertiesTestCase(unittest.TestCase):
else:
tensor2 = paddle.to_tensor(arr2, core.VarDesc.VarType.FP32,
core.CPUPlace())
-            self.assertTrue(np.array_equal(tensor.numpy(), arr1))
self.assertTrue(np.array_equal(tensor.numpy(), arr))
self.assertTrue(np.array_equal(tensor2.numpy(), arr2))
tensor2._share_buffer_to(tensor)
self.assertTrue(np.array_equal(tensor.numpy(), arr2))
......@@ -694,6 +695,7 @@ class EagerTensorPropertiesTestCase(unittest.TestCase):
self.assertEqual(tensor.stop_gradient, False)
tensor.stop_gradient = True
self.assertEqual(tensor.stop_gradient, True)
self.assertEqual(tensor.type, core.VarDesc.VarType.LOD_TENSOR)
def test_global_properties(self):
print("Test_global_properties")
......@@ -714,6 +716,25 @@ class EagerTensorPropertiesTestCase(unittest.TestCase):
self.assertTrue(core.eager._get_expected_place().is_cpu_place())
core._disable_eager_mode()
def test_value(self):
with _test_eager_guard():
arr = np.random.rand(4, 16, 16, 32).astype('float64')
egr_tensor0 = core.eager.EagerTensor(value=arr)
self.assertEqual(egr_tensor0.persistable, False)
self.assertTrue("generated" in egr_tensor0.name)
self.assertEqual(egr_tensor0.shape, [4, 16, 16, 32])
self.assertTrue(
egr_tensor0.place._equals(
paddle.fluid.framework._current_expected_place()))
self.assertEqual(egr_tensor0.dtype, core.VarDesc.VarType.FP64)
self.assertEqual(egr_tensor0.stop_gradient, True)
self.assertTrue(egr_tensor0.value().get_tensor()._dtype(),
core.VarDesc.VarType.FP64)
self.assertTrue(egr_tensor0.value().get_tensor()._place(),
paddle.fluid.framework._current_expected_place())
self.assertTrue(egr_tensor0.value().get_tensor()._is_initialized())
class EagerParamBaseUsageTestCase(unittest.TestCase):
def test_print(self):
......@@ -803,6 +824,7 @@ class EagerParamBaseUsageTestCase(unittest.TestCase):
self.assertTrue(egr_tensor12.place._equals(paddle.fluid.CPUPlace()))
self.assertTrue(np.array_equal(egr_tensor12.numpy(), arr4))
self.assertTrue(np.array_equal(egr_tensor12.gradient(), None))
egr_tensor12.stop_gradient = False
egr_tensor12.backward()
self.assertTrue(np.array_equal(egr_tensor12.gradient(), arr))
......
......@@ -181,6 +181,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
self.func_auto_prune2()
self.func_auto_prune2()
# TODO(jiabin): Support this when we support better split tensor
def test_auto_prune3(self):
with fluid.dygraph.guard():
case3 = AutoPruneLayer3(input_size=784)
......@@ -217,7 +218,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
self.assertTrue(case4.linear.weight._grad_ivar() is not None)
self.assertTrue((part2.gradient() == 0).all())
-    def test_auto_prune6(self):
def func_auto_prune6(self):
with fluid.dygraph.guard():
value0 = np.arange(26).reshape(2, 13).astype("float32")
value1 = np.arange(6).reshape(2, 3).astype("float32")
......@@ -235,7 +236,12 @@ class TestImperativeAutoPrune(unittest.TestCase):
self.assertTrue(linear.weight.gradient() is None)
self.assertTrue(out1.gradient() is None)
-    def test_auto_prune7(self):
def test_auto_prune6(self):
with _test_eager_guard():
self.func_auto_prune6()
self.func_auto_prune6()
def func_auto_prune7(self):
with fluid.dygraph.guard():
value0 = np.arange(26).reshape(2, 13).astype("float32")
value1 = np.arange(6).reshape(2, 3).astype("float32")
......@@ -253,7 +259,12 @@ class TestImperativeAutoPrune(unittest.TestCase):
self.assertTrue(linear.weight.gradient() is None)
self.assertTrue(out1.gradient() is None)
-    def test_auto_prune8(self):
def test_auto_prune7(self):
with _test_eager_guard():
self.func_auto_prune7()
self.func_auto_prune7()
def func_auto_prune8(self):
with fluid.dygraph.guard():
value0 = np.arange(26).reshape(2, 13).astype("float32")
value1 = np.arange(6).reshape(2, 3).astype("float32")
......@@ -278,7 +289,12 @@ class TestImperativeAutoPrune(unittest.TestCase):
self.assertFalse(
np.array_equal(linear_origin, linear.weight.numpy()))
-    def test_auto_prune9(self):
def test_auto_prune8(self):
with _test_eager_guard():
self.func_auto_prune8()
self.func_auto_prune8()
def func_auto_prune9(self):
with fluid.dygraph.guard():
value0 = np.arange(26).reshape(2, 13).astype("float32")
value1 = np.arange(6).reshape(2, 3).astype("float32")
......@@ -307,7 +323,12 @@ class TestImperativeAutoPrune(unittest.TestCase):
except ValueError as e:
assert type(e) == ValueError
-    def test_auto_prune10(self):
def test_auto_prune9(self):
with _test_eager_guard():
self.func_auto_prune9()
self.func_auto_prune9()
def func_auto_prune10(self):
with fluid.dygraph.guard():
value0 = np.arange(26).reshape(2, 13).astype("float32")
value1 = np.arange(6).reshape(2, 3).astype("float32")
......@@ -321,12 +342,18 @@ class TestImperativeAutoPrune(unittest.TestCase):
out2 = linear2(b)
out1.stop_gradient = True
out = fluid.layers.concat(input=[out1, out2, c], axis=1)
#TODO(jiabin): In Eager Mode we don't actually need sort_sum_gradient, this test should be removed when we don't support fluid anymore.
fluid.set_flags({'FLAGS_sort_sum_gradient': True})
out.backward()
self.assertTrue(linear.weight.gradient() is None)
self.assertTrue(out1.gradient() is None)
-    def test_auto_prune_with_optimizer(self):
def test_auto_prune10(self):
with _test_eager_guard():
self.func_auto_prune10()
self.func_auto_prune10()
def func_auto_prune_with_optimizer(self):
vocab_size = 100
size = 20
batch_size = 16
......@@ -341,7 +368,6 @@ class TestImperativeAutoPrune(unittest.TestCase):
grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters(), grad_clip=grad_clip)
-            indices = fluid.dygraph.to_variable(indices)
embed = fluid.dygraph.to_variable(embed)
dummy_loss = model(embed)
......@@ -374,7 +400,12 @@ class TestImperativeAutoPrune(unittest.TestCase):
assert model.embed1.weight._grad_ivar() is None
assert model.linear_1.weight._grad_ivar() is None
-    def test_case2_prune_no_grad_branch(self):
def test_auto_prune_with_optimizer(self):
with _test_eager_guard():
self.func_auto_prune_with_optimizer()
self.func_auto_prune_with_optimizer()
def func_case2_prune_no_grad_branch(self):
with fluid.dygraph.guard():
value1 = np.arange(784).reshape(1, 784)
value2 = np.arange(1).reshape(1, 1)
......@@ -386,7 +417,12 @@ class TestImperativeAutoPrune(unittest.TestCase):
self.assertTrue(case3.linear2.weight._grad_ivar() is None)
self.assertTrue(case3.linear.weight._grad_ivar() is not None)
-    def test_case3_prune_no_grad_branch2(self):
def test_case2_prune_no_grad_branch(self):
with _test_eager_guard():
self.func_case2_prune_no_grad_branch()
self.func_case2_prune_no_grad_branch()
def func_case3_prune_no_grad_branch2(self):
with fluid.dygraph.guard():
value1 = np.arange(1).reshape(1, 1)
linear = fluid.dygraph.Linear(1, 1, act=None)
......@@ -399,13 +435,23 @@ class TestImperativeAutoPrune(unittest.TestCase):
loss.backward()
self.assertTrue(linear.weight._grad_ivar() is None)
-    def test_case4_with_no_grad_op_maker(self):
def test_case3_prune_no_grad_branch2(self):
with _test_eager_guard():
self.func_case3_prune_no_grad_branch2()
self.func_case3_prune_no_grad_branch2()
def func_case4_with_no_grad_op_maker(self):
with fluid.dygraph.guard():
out = fluid.layers.gaussian_random(shape=[20, 30])
loss = fluid.layers.mean(out)
loss.backward()
self.assertTrue(out._grad_ivar() is None)
def test_case4_with_no_grad_op_maker(self):
with _test_eager_guard():
self.func_case4_with_no_grad_op_maker()
self.func_case4_with_no_grad_op_maker()
if __name__ == '__main__':
unittest.main()