diff --git a/paddle/fluid/pybind/eager.h b/paddle/fluid/pybind/eager.h
index 5560744ae1d49cb5fb44a0e743d48e1be1f074c5..f617ead08e24329aa69bf38f5e55715caed0ba9a 100644
--- a/paddle/fluid/pybind/eager.h
+++ b/paddle/fluid/pybind/eager.h
@@ -29,7 +29,7 @@ typedef struct {
 typedef struct {
   PyObject_HEAD PyObject* container;
   PyObject* non_differentiable;
-  PyObject* dirty_tensors;
+  PyObject* not_inplace_tensors;
   bool materialize_grads;
   std::vector<bool> forward_input_tensor_is_duplicable;
   std::vector<bool> forward_output_tensor_is_duplicable;
diff --git a/paddle/fluid/pybind/eager_py_layer.cc b/paddle/fluid/pybind/eager_py_layer.cc
index b841afff1579f3db4001d041f02a0f0c92a47443..7e25b06e80a4dd70b4fcbaa55021b199e6998580 100644
--- a/paddle/fluid/pybind/eager_py_layer.cc
+++ b/paddle/fluid/pybind/eager_py_layer.cc
@@ -92,8 +92,8 @@ static void PyLayerDealloc(PyLayerObject* self) {
   if (self->non_differentiable) {
     Py_DECREF(self->non_differentiable);
   }
-  if (self->dirty_tensors) {
-    Py_DECREF(self->dirty_tensors);
+  if (self->not_inplace_tensors) {
+    Py_DECREF(self->not_inplace_tensors);
   }
   self->grad_node.~weak_ptr();
   self->forward_input_tensor_is_duplicable.~vector();
@@ -108,6 +108,20 @@ PyObject* pylayer_method_name(PyObject* self, PyObject* noargs) {
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
+PyObject* new_tensor_with_impl(paddle::experimental::Tensor* tensor) {
+  PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0);
+  if (obj) {
+    auto v = reinterpret_cast<TensorObject*>(obj);
+    new (&(v->tensor)) paddle::experimental::Tensor();
+    v->tensor.set_impl(tensor->impl());
+    v->tensor.set_name(egr::Controller::Instance().GenerateUniqueName());
+  } else {
+    PADDLE_THROW(platform::errors::Fatal(
+        "tp_alloc return null, can not new a PyObject."));
+  }
+  return obj;
+}
+
 PyObject* pylayer_method_apply(PyObject* cls,
                                PyObject* args,
                                PyObject* kwargs) {
@@ -151,6 +165,7 @@ PyObject* pylayer_method_apply(PyObject* cls,
   inputs_tensor.reserve(inputs_size);
   ctx->forward_input_tensor_is_duplicable.clear();
   ctx->forward_input_tensor_is_duplicable.reserve(inputs_size);
+  std::set<phi::TensorBase*> input_tensorbases;
   for (size_t i = 0; i < inputs_size; i++) {
     PyObject* obj = nullptr;
     if (i >= args_size) {
@@ -159,6 +174,8 @@ PyObject* pylayer_method_apply(PyObject* cls,
       obj = PyTuple_GET_ITEM(args, i);
     }
     if (IsEagerTensor(obj)) {
+      input_tensorbases.insert(
+          reinterpret_cast<TensorObject*>(obj)->tensor.impl().get());
       auto autograd_meta = egr::EagerUtils::nullable_autograd_meta(
           reinterpret_cast<TensorObject*>(obj)->tensor);
       inputs_autograd_meta.push_back({autograd_meta});
@@ -173,10 +190,12 @@ PyObject* pylayer_method_apply(PyObject* cls,
     } else if (PyList_Check(obj)) {
       std::vector<paddle::experimental::Tensor*> tensors;
       Py_ssize_t len = PyList_Size(obj);
-      for (Py_ssize_t i = 0; i < len; i++) {
-        if (IsEagerTensor(PyList_GetItem(obj, i))) {
-          tensors.push_back(&(
-              reinterpret_cast<TensorObject*>(PyList_GetItem(obj, i))->tensor));
+      for (Py_ssize_t j = 0; j < len; j++) {
+        PyObject* o = PyList_GetItem(obj, j);
+        if (IsEagerTensor(o)) {
+          input_tensorbases.insert(
+              reinterpret_cast<TensorObject*>(o)->tensor.impl().get());
+          tensors.push_back(&(reinterpret_cast<TensorObject*>(o)->tensor));
         }
       }
       if (!tensors.empty()) {
@@ -194,11 +213,12 @@ PyObject* pylayer_method_apply(PyObject* cls,
     } else if (PyTuple_Check(obj)) {
       std::vector<paddle::experimental::Tensor*> tensors;
       Py_ssize_t len = PyTuple_Size(obj);
-      for (Py_ssize_t i = 0; i < len; i++) {
-        if (IsEagerTensor(PyTuple_GetItem(obj, i))) {
-          tensors.push_back(
-              &(reinterpret_cast<TensorObject*>(PyTuple_GetItem(obj, i))
-                    ->tensor));
+      for (Py_ssize_t j = 0; j < len; j++) {
+        PyObject* o = PyTuple_GetItem(obj, j);
+        if (IsEagerTensor(o)) {
+          input_tensorbases.insert(
+              reinterpret_cast<TensorObject*>(o)->tensor.impl().get());
+          tensors.push_back(&(reinterpret_cast<TensorObject*>(o)->tensor));
         }
       }
       if (!tensors.empty()) {
@@ -252,6 +272,13 @@ PyObject* pylayer_method_apply(PyObject* cls,
     PyTuple_SET_ITEM(outputs_tuple, 0, outputs);
   }
 
+  std::set<paddle::experimental::Tensor*> inplace_tensors;
+  std::set<phi::TensorBase*> not_inplace_tensorbases;
+  auto not_inplace_tensors = GetTensorsFromPyObject(ctx->not_inplace_tensors);
+  for (auto it : not_inplace_tensors) {
+    not_inplace_tensorbases.insert(it->impl().get());
+  }
+
   auto outputs_size = PyTuple_GET_SIZE(outputs_tuple);
   std::vector<std::vector<paddle::experimental::Tensor*>> outputs_tensor;
   outputs_tensor.reserve(outputs_size);
@@ -267,13 +294,39 @@ PyObject* pylayer_method_apply(PyObject* cls,
       outputs_autograd_meta.push_back({egr::EagerUtils::autograd_meta(
           &(reinterpret_cast<TensorObject*>(obj)->tensor))});
       ctx->forward_output_tensor_is_duplicable.push_back(false);
+      if (input_tensorbases.count(
+              reinterpret_cast<TensorObject*>(obj)->tensor.impl().get())) {
+        if (not_inplace_tensorbases.count(
+                reinterpret_cast<TensorObject*>(obj)->tensor.impl().get())) {
+          PyTuple_SET_ITEM(outputs_tuple,
+                           i,
+                           new_tensor_with_impl(&(
+                               reinterpret_cast<TensorObject*>(obj)->tensor)));
+        } else {
+          inplace_tensors.insert(
+              &(reinterpret_cast<TensorObject*>(obj)->tensor));
+        }
+      }
     } else if (PyList_Check(obj)) {
       std::vector<paddle::experimental::Tensor*> tensors;
       Py_ssize_t len = PyList_Size(obj);
-      for (Py_ssize_t i = 0; i < len; i++) {
-        if (IsEagerTensor(PyList_GetItem(obj, i))) {
-          tensors.push_back(&(
-              reinterpret_cast<TensorObject*>(PyList_GetItem(obj, i))->tensor));
+      for (Py_ssize_t j = 0; j < len; j++) {
+        PyObject* o = PyList_GetItem(obj, j);
+        if (IsEagerTensor(o)) {
+          tensors.push_back(&(reinterpret_cast<TensorObject*>(o)->tensor));
+          if (input_tensorbases.count(
+                  reinterpret_cast<TensorObject*>(o)->tensor.impl().get())) {
+            if (not_inplace_tensorbases.count(
+                    reinterpret_cast<TensorObject*>(o)->tensor.impl().get())) {
+              PyTuple_SetItem(obj,
+                              j,
+                              new_tensor_with_impl(&(
+                                  reinterpret_cast<TensorObject*>(o)->tensor)));
+            } else {
+              inplace_tensors.insert(
+                  &(reinterpret_cast<TensorObject*>(o)->tensor));
+            }
+          }
         }
       }
       if (!tensors.empty()) {
@@ -285,11 +338,23 @@ PyObject* pylayer_method_apply(PyObject* cls,
     } else if (PyTuple_Check(obj)) {
       std::vector<paddle::experimental::Tensor*> tensors;
       Py_ssize_t len = PyTuple_Size(obj);
-      for (Py_ssize_t i = 0; i < len; i++) {
-        if (IsEagerTensor(PyTuple_GetItem(obj, i))) {
-          tensors.push_back(
-              &(reinterpret_cast<TensorObject*>(PyTuple_GetItem(obj, i))
-                    ->tensor));
+      for (Py_ssize_t j = 0; j < len; j++) {
+        PyObject* o = PyTuple_GetItem(obj, j);
+        if (IsEagerTensor(o)) {
+          tensors.push_back(&(reinterpret_cast<TensorObject*>(o)->tensor));
+          if (input_tensorbases.count(
+                  reinterpret_cast<TensorObject*>(o)->tensor.impl().get())) {
+            if (not_inplace_tensorbases.count(
+                    reinterpret_cast<TensorObject*>(o)->tensor.impl().get())) {
+              PyTuple_SetItem(obj,
+                              j,
+                              new_tensor_with_impl(&(
+                                  reinterpret_cast<TensorObject*>(o)->tensor)));
+            } else {
+              inplace_tensors.insert(
+                  &(reinterpret_cast<TensorObject*>(o)->tensor));
+            }
+          }
         }
       }
       if (!tensors.empty()) {
@@ -320,21 +385,19 @@ PyObject* pylayer_method_apply(PyObject* cls,
     }
   }
 
-  // add inplace strategy, inplaced tensor is ctx->dirty_tensors
-  auto dirty_tensors = GetTensorsFromPyObject(ctx->dirty_tensors);
-  for (auto it = dirty_tensors.begin(); it != dirty_tensors.end(); ++it) {
-    auto dirty_tensor = *it;
-    auto dirty_tensor_autograd_meta =
-        egr::EagerUtils::autograd_meta(dirty_tensor);
-    PADDLE_ENFORCE_EQ(!dirty_tensor_autograd_meta->StopGradient() &&
-                          egr::egr_utils_api::IsLeafTensor(*dirty_tensor),
+  for (auto it = inplace_tensors.begin(); it != inplace_tensors.end(); ++it) {
+    auto inplace_tensor = *it;
+    auto inplace_tensor_autograd_meta =
+        egr::EagerUtils::autograd_meta(inplace_tensor);
+    PADDLE_ENFORCE_EQ(!inplace_tensor_autograd_meta->StopGradient() &&
+                          egr::egr_utils_api::IsLeafTensor(*inplace_tensor),
                       false,
                       paddle::platform::errors::InvalidArgument(
                           "Leaf Var (%s) that doesn't stop gradient "
                           "can't use inplace strategy.",
-                          dirty_tensor->name()));
-    dirty_tensor->bump_inplace_version();
-    VLOG(3) << "Tensor(" << dirty_tensor->name()
+                          inplace_tensor->name()));
+    inplace_tensor->bump_inplace_version();
+    VLOG(3) << "Tensor(" << inplace_tensor->name()
             << ") uses Inplace Strategy.";
   }
 
@@ -376,7 +439,10 @@ PyObject* pylayer_method_apply(PyObject* cls,
     VLOG(6) << "PyLayer construct backward node finish...";
   }
 
-  if (!PyTuple_Check(outputs)) {
+  if (outputs_size == 1) {
+    Py_XDECREF(outputs);
+    outputs = PyTuple_GetItem(outputs_tuple, 0);
+    Py_INCREF(outputs);
     Py_XDECREF(outputs_tuple);
   }
   Py_XDECREF(forward_args);
@@ -389,12 +455,6 @@ PyObject* pylayer_method_apply(PyObject* cls,
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
-PyObject* pylayer_method_register_hook(PyObject* _self, PyObject* hook) {
-  EAGER_TRY
-  return nullptr;
-  EAGER_CATCH_AND_THROW_RETURN_NULL
-}
-
 PyObject* tensor_properties_get_container(PyLayerObject* self, void* closure) {
   EAGER_TRY
   if (self->container == nullptr) {
@@ -438,24 +498,24 @@ int tensor_properties_set_non_differentiable(PyLayerObject* self,
   EAGER_CATCH_AND_THROW_RETURN_NEG
 }
 
-PyObject* tensor_properties_get_dirty_tensors(PyLayerObject* self,
-                                              void* closure) {
+PyObject* tensor_properties_get_not_inplace_tensors(PyLayerObject* self,
+                                                    void* closure) {
   EAGER_TRY
-  if (self->dirty_tensors == nullptr) {
+  if (self->not_inplace_tensors == nullptr) {
     RETURN_PY_NONE;
   }
-  Py_INCREF(self->dirty_tensors);
-  return self->dirty_tensors;
+  Py_INCREF(self->not_inplace_tensors);
+  return self->not_inplace_tensors;
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
-int tensor_properties_set_dirty_tensors(PyLayerObject* self,
-                                        PyObject* value,
-                                        void* closure) {
+int tensor_properties_set_not_inplace_tensors(PyLayerObject* self,
+                                              PyObject* value,
+                                              void* closure) {
   EAGER_TRY
   Py_XINCREF(value);
-  Py_XDECREF(self->dirty_tensors);
-  self->dirty_tensors = value;
+  Py_XDECREF(self->not_inplace_tensors);
+  self->not_inplace_tensors = value;
   return 0;
   EAGER_CATCH_AND_THROW_RETURN_NEG
 }
@@ -478,10 +538,6 @@ PyMethodDef pylayer_methods[] = {
     {"apply",
     (PyCFunction)(void (*)(void))pylayer_method_apply,
     METH_CLASS | METH_VARARGS | METH_KEYWORDS,
     NULL},
-    {"register_hook",
-    (PyCFunction)(void (*)(void))pylayer_method_register_hook,
-    METH_O,
-    NULL},
    {NULL, NULL, 0, NULL}};
 
 struct PyGetSetDef pylayer_properties[] {
@@ -495,9 +551,9 @@ struct PyGetSetDef pylayer_properties[] {
     (setter)tensor_properties_set_non_differentiable,
     nullptr,
     nullptr},
-    {"dirty_tensors",
-    (getter)tensor_properties_get_dirty_tensors,
-    (setter)tensor_properties_set_dirty_tensors,
+    {"not_inplace_tensors",
+    (getter)tensor_properties_get_not_inplace_tensors,
+    (setter)tensor_properties_set_not_inplace_tensors,
     nullptr,
     nullptr},
    {"materialize_grads",
diff --git a/python/paddle/autograd/py_layer.py b/python/paddle/autograd/py_layer.py
index 22fc8bf47c1840d7a87699725225639bddc0fb2b..673b047d5a3bad96e47749c739444c53257fba57 100644
--- a/python/paddle/autograd/py_layer.py
+++ b/python/paddle/autograd/py_layer.py
@@ -407,13 +407,50 @@ class EagerPyLayerContext(object):
         """
         return self.container
 
-    def mark_dirty(self, *args):
-        self.dirty_tensors = args
+    def mark_not_inplace(self, *args):
+        """
+        Marks inputs as not inplace.
+        This should be called at most once, only from inside the `forward` method,
+        and all arguments should be Tensor inputs.
+
+        If a Tensor returned by `forward` is the same Tensor that was passed in as an input,
+        and that Tensor is marked as not inplace, Paddle will create a new Tensor as the output
+        instead, preventing the autograd information of the input Tensor from being overwritten.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle
+
+                class Exp(paddle.autograd.PyLayer):
+                    @staticmethod
+                    def forward(ctx, x):
+                        ctx.mark_not_inplace(x)
+                        return x
+
+                    @staticmethod
+                    def backward(ctx, grad_output):
+                        out = grad_output.exp()
+                        return out
+
+                x = paddle.randn((1, 1))
+                x.stop_gradient = False
+                attn_layers = []
+                for idx in range(0, 2):
+                    attn_layers.append(Exp())
+
+                for step in range(0, 2):
+                    a = x
+                    for j in range(0, 2):
+                        a = attn_layers[j].apply(x)
+                    a.backward()
+        """
+        self.not_inplace_tensors = args
 
     def mark_non_differentiable(self, *args):
         """
         Marks outputs as non-differentiable.
-        This should be called at most once, only from inside thethe `forward` method,
+        This should be called at most once, only from inside the `forward` method,
         and all arguments should be tensor outputs.
 
         This will mark outputs as not requiring gradients, increasing the
@@ -475,7 +512,7 @@ class EagerPyLayerContext(object):
                 class Tanh(PyLayer):
                     @staticmethod
                     def forward(ctx, x):
-                        return x, x+x
+                        return x+x+x, x+x
 
                     @staticmethod
                     def backward(ctx, grad, grad2):
@@ -486,7 +523,7 @@ class EagerPyLayerContext(object):
                    @staticmethod
                    def forward(ctx, x):
                        ctx.set_materialize_grads(False)
-                        return x, x+x
+                        return x+x+x, x+x
 
                    @staticmethod
                    def backward(ctx, grad, grad2):
diff --git a/python/paddle/fluid/tests/unittests/test_pylayer_op.py b/python/paddle/fluid/tests/unittests/test_pylayer_op.py
index f7f5e81b841ed5d736455eb34155ba70fac63273..eb6502a97a6b30aca606a8ffeb2ee0c593e6b381 100644
--- a/python/paddle/fluid/tests/unittests/test_pylayer_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pylayer_op.py
@@ -480,7 +480,7 @@ class TestPyLayer(unittest.TestCase):
                 super(Layer, self).__init__()
 
             def forward(self, data):
-                data = paddle.nn.functional.relu(data)
+                data = data**2
                 z = paddle.tanh(data)
                 z = cus_tanh.apply(data)
                 return z.mean()
@@ -506,7 +506,6 @@ class TestPyLayer(unittest.TestCase):
 
             @staticmethod
             def forward(ctx, x):
-                ctx.mark_dirty(x)
                 return x
 
             @staticmethod
@@ -543,7 +542,6 @@ class TestPyLayer(unittest.TestCase):
 
             @staticmethod
             def forward(ctx, x):
-                ctx.mark_dirty(x)
                 return x
 
             @staticmethod
@@ -578,7 +576,6 @@ class TestPyLayer(unittest.TestCase):
 
             @staticmethod
             def forward(ctx, x):
-                ctx.mark_dirty(x)
                 return x
 
             @staticmethod
@@ -612,8 +609,6 @@ class TestPyLayer(unittest.TestCase):
 
             @staticmethod
             def forward(ctx, x):
-                if in_dygraph_mode():
-                    ctx.mark_dirty(x)
                 return x
 
             @staticmethod
@@ -710,6 +705,7 @@ class TestPyLayer(unittest.TestCase):
 
             @staticmethod
             def forward(ctx, x):
+                ctx.mark_not_inplace(x)
                 return x, x + x
 
             @staticmethod
@@ -728,6 +724,7 @@ class TestPyLayer(unittest.TestCase):
 
             @staticmethod
             def forward(ctx, x):
+                ctx.mark_not_inplace(x)
                 ctx.set_materialize_grads(False)
                 return x, x + x
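A minimal sketch of how the new `mark_not_inplace` API is meant to be used (the `Identity` layer below is illustrative, and it assumes dynamic-graph mode, where `paddle.autograd.PyLayer` maps onto the eager PyLayer implementation patched above). Marking the input makes `apply` hand back a fresh Tensor built on the same underlying impl instead of reusing the input itself, so the input's autograd information is not overwritten; per the inplace check in `pylayer_method_apply`, returning a grad-requiring leaf Tensor unchanged without this mark is rejected.

    import paddle

    class Identity(paddle.autograd.PyLayer):
        @staticmethod
        def forward(ctx, x):
            # Mark x so that apply() builds a new output Tensor on top of
            # x's underlying impl instead of taking the inplace path.
            ctx.mark_not_inplace(x)
            return x

        @staticmethod
        def backward(ctx, grad):
            return grad

    x = paddle.randn((2, 3))
    x.stop_gradient = False
    y = Identity.apply(x)   # y shares x's storage but is a distinct Tensor
    y.sum().backward()
    print(x.grad)           # x's grad node was preserved, so its grad is populated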