From 8e612903d342f4f717ff195bac3ebc77a2672a10 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Sat, 19 Mar 2022 00:21:38 +0800 Subject: [PATCH] support inplace in dygraph eager_fluid state (#40400) * [Eager] Support eager grad interface, draft version * Support eager grad interface with allow_unused and multi startup_op * Fix code format * Fix allow_unused case, return PyNone if tensor not initialize * Support output's stop_gradient related to create_graph * Support grad exception case in eager mode, fix coverage CI * Update ToPyObject, return PyNone if not initialize * AccumulationNode add FLAGS_retain_grad_for_all_tensor * Fix ci issue * Fix CI issue * fix, use core.eager.Tensor * Add func SetBufferSlotRankZeros for GradTensorHolder * Support retain_graph by using ClearTensorWrappers * Support retain_graph by using ClearTensorWrappers * Update retain_graph and no_grad_vars related test case * Update code gen logic for ClearTensorWrappers * Fix by override statement * fix override func args * Support retain_graph, update unit tests * Updated ClearTensorWrappers logic * fix grad python interface * Use deep copy and update unit tests * Polish code * Polish code * Fix CI issue, Deep copy only use when user set grad_tensors * Fix CI, use Backward instead RunBackward * Fix CI, Declare kernel explicitly in test file * Polish, remove vector of TensorWrapper * Refactor the logic of grad/backward, polish codes * Update code after merge upstream develop * Polish after merge upstream develop * Update to adapt new GradNodeBase superclass * Fix error introduced during conflict resolution * support inplace strategy in eager_fluid state * solve conflict * nothing * Update purify potential_startup_nodes logic * Fix errors * Polish code * Remove useless args for ToPyObject * Remove useless TensorWrappersSet * fix record conflict * Fix code-format, re-install pre-commit * fix tensor_wrapper bug * Fix pre-process logic for potential_startup_ops * Update unit tests, use eager mode * Fix conflicts * fix unittest timeout * little change Co-authored-by: Weilong Wu --- paddle/fluid/eager/api/utils/tensor_utils.cc | 3 +- .../auto_code_generator/eager_generator.cc | 396 ++++++++++++----- paddle/fluid/eager/tensor_wrapper.h | 48 +++ paddle/fluid/eager/utils.cc | 21 + paddle/fluid/eager/utils.h | 17 + paddle/fluid/pybind/eager_method.cc | 11 + .../pybind/eager_op_function_generator.cc | 73 +++- paddle/fluid/pybind/eager_utils.cc | 16 + paddle/fluid/pybind/eager_utils.h | 44 ++ paddle/fluid/pybind/op_function_common.cc | 25 ++ paddle/fluid/pybind/op_function_common.h | 5 + paddle/phi/api/include/tensor.h | 16 +- paddle/phi/api/lib/tensor.cc | 31 ++ .../fluid/tests/unittests/CMakeLists.txt | 1 + .../unittests/test_inplace_eager_fluid.py | 397 ++++++++++++++++++ 15 files changed, 991 insertions(+), 113 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index 77c39d1b0a3..b485beca57a 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -30,7 +30,8 @@ namespace egr_utils_api { bool IsLeafTensor(const paddle::experimental::Tensor& target) { std::shared_ptr grad_node = EagerUtils::grad_node(target); - if (std::dynamic_pointer_cast(grad_node)) { + if (!grad_node || + std::dynamic_pointer_cast(grad_node)) { return true; } diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc 
b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index d15c413339a..b8d59e8dd8b 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -979,7 +979,9 @@ static bool CollectGradInformationFromOpInfo( /* --------------------------------------------------- */ static std::string GenerateGradNodeCreationContent( const ForwardGenerationInfo& fwd_info, - const GradNodeGenerationInfo& bwd_info) { + const GradNodeGenerationInfo& bwd_info, + const std::string& trace_op_body_str, + std::map inplace_map = {}) { VLOG(6) << "Generating GradNode Creation codes"; const std::string& op_type = fwd_info.GetOpType(); @@ -998,7 +1000,8 @@ static std::string GenerateGradNodeCreationContent( // If single output slotname and not duplicable, // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")" - std::string get_autograd_meta_str = " // Prepare Autograd Meta \n"; + std::string get_input_autograd_meta_str = " // Prepare Autograd Meta \n"; + std::string get_output_autograd_meta_str = ""; // If single output slotname and not duplicable, // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" @@ -1006,22 +1009,39 @@ static std::string GenerateGradNodeCreationContent( const std::string& output_name = output.name(); const std::string& output_autograd_name = "p_autograd_" + output_name; + // output autograd_meta should be got after running TraceOP. if (output.duplicable()) { const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = - " std::vector %s = " + " std::vector %s = " "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( + get_output_autograd_meta_str += paddle::string::Sprintf( GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); } else { - const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = - " egr::AutogradMeta* %s = " - "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); + // In inplace op, the case where output is duplicable is not considered. + // Replace output directly with input in inplace op. + if (!inplace_map.empty() && inplace_map.count(output_name)) { + auto inplace_input_name = inplace_map[output_name]; + const std::string& inplace_input_autograd_name = + "p_autograd_" + inplace_input_name; + const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = + " %s = egr::EagerUtils::autograd_meta(&%s);\n"; + get_output_autograd_meta_str += paddle::string::Sprintf( + GET_SINGLE_AUTOGRAD_META_TEMPLATE, inplace_input_autograd_name, + inplace_input_name); + } else { + const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = + " egr::AutogradMeta* %s = " + "egr::EagerUtils::autograd_meta(&%s);\n"; + get_output_autograd_meta_str += + paddle::string::Sprintf(GET_SINGLE_AUTOGRAD_META_TEMPLATE, + output_autograd_name, output_name); + } } } VLOG(6) << "Generated outputs autograd_meta"; + // input autograd_meta should be got before running TraceOP (for checking + // inplace). 
for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); const std::string& input_autograd_name = "p_autograd_" + input_name; @@ -1030,28 +1050,46 @@ static std::string GenerateGradNodeCreationContent( const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = " std::vector %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( + get_input_autograd_meta_str += paddle::string::Sprintf( GET_MULTI_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); } else if (input.dispensable()) { const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( + get_input_autograd_meta_str += paddle::string::Sprintf( GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); } else { const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( + get_input_autograd_meta_str += paddle::string::Sprintf( GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); } } VLOG(6) << "Generated inputs autograd_meta"; + // check inplace input to avoid inplace operations on leaf nodes with + // stop_gradient=False. + std::string check_inplace_str = ""; + if (!inplace_map.empty()) { + const char* CHECKING_INPLACE_TEMPLATE = + " // Check Inplace\n" + " egr::EagerUtils::CheckInplace(%s, p_autograd_%s, " + "require_any_grad);\n"; + for (auto& inplace_pair : inplace_map) { + std::string inplace_name = inplace_pair.second; + check_inplace_str += paddle::string::Sprintf(CHECKING_INPLACE_TEMPLATE, + inplace_name, inplace_name); + } + VLOG(6) << "Check Inplace Input"; + } + std::string prepare_autograd_meta_str = ""; - prepare_autograd_meta_str += get_autograd_meta_str; + // only generate input autograd_meta in temporary. + // output autograd_meta will be generated after running TraceOP. 
+ prepare_autograd_meta_str += get_input_autograd_meta_str; prepare_autograd_meta_str += "\n"; // [GradOpNode] GetTraceBackward @@ -1066,7 +1104,7 @@ static std::string GenerateGradNodeCreationContent( size_t bwd_in_slot_num = out_vars.size(); size_t bwd_out_slot_num = in_vars.size(); const char* GRAD_OP_NODE_TEMPLATE = - " auto grad_node = std::make_shared(%d, %d);\n"; + " auto grad_node = std::make_shared(%d, %d);\n"; grad_node_creation_str += " // Create GradOpNode\n"; grad_node_creation_str += paddle::string::Sprintf( GRAD_OP_NODE_TEMPLATE, op_type, bwd_in_slot_num, bwd_out_slot_num); @@ -1075,14 +1113,14 @@ static std::string GenerateGradNodeCreationContent( VLOG(6) << "Generated GradOpNode construction"; // [GradOpNode] Set Attrs - grad_node_creation_str += " // Set Attributes\n"; - grad_node_creation_str += " grad_node->SetAttrMap(std::move(attrs));\n"; + grad_node_creation_str += " // Set Attributes\n"; + grad_node_creation_str += " grad_node->SetAttrMap(std::move(attrs));\n"; grad_node_creation_str += - " grad_node->SetDefaultAttrMap(std::move(default_attrs));\n"; + " grad_node->SetDefaultAttrMap(std::move(default_attrs));\n"; grad_node_creation_str += "\n"; // [GradOpNode] Set TensorWrappers - grad_node_creation_str += " // Set Tensor Wrappers\n"; + grad_node_creation_str += " // Set Tensor Wrappers\n"; for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); @@ -1094,10 +1132,18 @@ static std::string GenerateGradNodeCreationContent( full_reserved = "true"; } const char* SET_TENSOR_WRAPPER_TEMPLATE = - " grad_node->SetTensorWrapper%s(%s, %s);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, tensor_wrapper_name, - full_reserved); + " grad_node->SetTensorWrapper%s(%s, %s);\n"; + // Replace output directly with input in inplace op. 
+ if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) { + auto inplace_input_name = inplace_map[tensor_wrapper_name]; + grad_node_creation_str += paddle::string::Sprintf( + SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, + inplace_input_name, full_reserved); + } else { + grad_node_creation_str += paddle::string::Sprintf( + SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, + tensor_wrapper_name, full_reserved); + } } } grad_node_creation_str += "\n"; @@ -1115,12 +1161,12 @@ static std::string GenerateGradNodeCreationContent( size_t input_position = fwd_inputs_name_pos_map.at(input_name); const char* SET_GRAD_OUT_META_TEMPLATE = - " grad_node->SetGradOutMeta(%s, %d);\n"; + " grad_node->SetGradOutMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); const char* ADD_EDGES_TEMPLATE = - " if(%s) grad_node->AddEdges(%s, %d);\n"; + " if(%s) grad_node->AddEdges(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf(ADD_EDGES_TEMPLATE, input_autograd_name, input_autograd_name, input_position); @@ -1129,11 +1175,11 @@ static std::string GenerateGradNodeCreationContent( size_t input_position = fwd_inputs_name_pos_map.at(input_name); const char* SET_GRAD_OUT_META_TEMPLATE = - " grad_node->SetGradOutMeta(%s, %d);\n"; + " grad_node->SetGradOutMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); - const char* ADD_EDGES_TEMPLATE = " grad_node->AddEdges(&%s, %d);\n"; + const char* ADD_EDGES_TEMPLATE = " grad_node->AddEdges(&%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( ADD_EDGES_TEMPLATE, input_autograd_name, input_position); } @@ -1145,73 +1191,125 @@ static std::string GenerateGradNodeCreationContent( std::string pass_stop_gradient_args = "false"; for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); - const std::string& output_autograd_name = "p_autograd_" + output_name; - size_t output_position = fwd_outputs_name_pos_map.at(output_name); - - // Intermediate Tensor does not require SetHistory, nor RetainGrad - - if (output.duplicable()) { - pass_stop_gradient_args += ", &" + output_autograd_name; + // Replace output directly with input in inplace op. 
+ if (!inplace_map.empty() && inplace_map.count(output_name)) { + auto inplace_input_name = inplace_map[output_name]; + const std::string& inplace_input_autograd_name = + "p_autograd_" + inplace_input_name; + size_t output_position = fwd_outputs_name_pos_map.at(output_name); + + // Intermediate Tensor does not require SetHistory, nor RetainGrad + pass_stop_gradient_args += ", " + inplace_input_autograd_name; const char* SET_OUT_RANK_TEMPLATE = - " egr::EagerUtils::SetOutRankWithSlot(&%s, %d);\n"; + " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + SET_OUT_RANK_TEMPLATE, inplace_input_autograd_name, output_position); // Intermediate Tensor does not require SetHistory if (!output.intermediate()) { const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + " egr::EagerUtils::SetHistory(%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, inplace_input_autograd_name); } const char* SET_GRAD_IN_META_TEMPLATE = - " grad_node->SetGradInMeta(%s, %d);\n"; + " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + SET_GRAD_IN_META_TEMPLATE, inplace_input_name, output_position); + // Intermediate Tensor does not require CheckAndRetainGrad + if (!output.intermediate()) { + VLOG(6) << "Generated Call RetainGradForTensor"; + const char* RETAIN_GRAD_TEMPLATE = + " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; + grad_node_creation_str += + paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, inplace_input_name); + } } else { - pass_stop_gradient_args += ", " + output_autograd_name; - const char* SET_OUT_RANK_TEMPLATE = - " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + const std::string& output_autograd_name = "p_autograd_" + output_name; + size_t output_position = fwd_outputs_name_pos_map.at(output_name); - // Intermediate Tensor does not require SetHistory + // Intermediate Tensor does not require SetHistory, nor RetainGrad + + if (output.duplicable()) { + pass_stop_gradient_args += ", &" + output_autograd_name; + const char* SET_OUT_RANK_TEMPLATE = + " egr::EagerUtils::SetOutRankWithSlot(&%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, output_autograd_name); + } + const char* SET_GRAD_IN_META_TEMPLATE = + " grad_node->SetGradInMeta(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + + } else { + pass_stop_gradient_args += ", " + output_autograd_name; + const char* SET_OUT_RANK_TEMPLATE = + " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " 
egr::EagerUtils::SetHistory(%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, output_autograd_name); + } + const char* SET_GRAD_IN_META_TEMPLATE = + " grad_node->SetGradInMeta(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + } + + // Intermediate Tensor does not require CheckAndRetainGrad if (!output.intermediate()) { - const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(%s, grad_node);\n"; + VLOG(6) << "Generated Call RetainGradForTensor"; + const char* RETAIN_GRAD_TEMPLATE = + " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); } - const char* SET_GRAD_IN_META_TEMPLATE = - " grad_node->SetGradInMeta(%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_name, output_position); - } - - // Intermediate Tensor does not require CheckAndRetainGrad - if (!output.intermediate()) { - VLOG(6) << "Generated Call RetainGradForTensor"; - const char* RETAIN_GRAD_TEMPLATE = - " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; - grad_node_creation_str += - paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); } } VLOG(6) << "Generated SetGradIn/OutMeta"; // [Generation] GradNode Creation + // After getting require_any_grad, firstly use CheckInplace method for inplace + // op. + // Then execute TraceOp and generate output autograd_meta. + // Finally, Construct GradNode. (Replace output directly with input in inplace + // op.) + // Add event record + std::string event_name = op_type + " node_creation"; const char* GRAD_NODE_CREATION_TEMPLATE = - " %s" + "%s" " bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n" - " if(require_any_grad) {\n" - " VLOG(6) << \" Construct Grad for %s \"; \n" - " egr::EagerUtils::PassStopGradient(%s);\n" - "%s\n }"; + "%s\n" + "%s" + " {\n" + " paddle::platform::RecordEvent node_creation_record_event(\"%s\", " + "paddle::platform::TracerEventType::Operator, 1);\n" + "%s" + " if(require_any_grad) {\n" + " VLOG(6) << \" Construct Grad for %s \"; \n" + " egr::EagerUtils::PassStopGradient(%s);\n" + " %s\n" + " }\n" + " }"; std::string grad_node_creation_body_str = paddle::string::Sprintf( GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str, - compute_require_grad_args, op_type, pass_stop_gradient_args, - grad_node_creation_str); + compute_require_grad_args, check_inplace_str, trace_op_body_str, + event_name, get_output_autograd_meta_str, op_type, + pass_stop_gradient_args, grad_node_creation_str); return grad_node_creation_body_str; } @@ -1221,7 +1319,8 @@ static std::string GenerateGradNodeCreationContent( /* -------------------------------- */ static std::pair GenerateForwardFunctionContents( const ForwardGenerationInfo& fwd_info, - const GradNodeGenerationInfo& bwd_info) { + const GradNodeGenerationInfo& bwd_info, + std::map inplace_map = {}) { /* --- Process Forward Info ---*/ const std::string& op_type = fwd_info.GetOpType(); const std::unordered_map& fwd_inputs_name_pos_map = @@ -1301,8 +1400,21 @@ static std::pair GenerateForwardFunctionContents( core_ops_args_type_info[op_type][input_position] = "list"; } else { - const char* FWD_INS_ARG_TEMPLATE = - "const paddle::experimental::Tensor& %s"; + // inplace tensor can't be const + const char* FWD_INS_ARG_TEMPLATE; + bool flag_find_input_name = false; + if 
(!inplace_map.empty()) { + for (auto& inplace_pair : inplace_map) { + if (inplace_pair.second == input_name) { + flag_find_input_name = true; + FWD_INS_ARG_TEMPLATE = "paddle::experimental::Tensor& %s"; + break; + } + } + } + if (!flag_find_input_name) { + FWD_INS_ARG_TEMPLATE = "const paddle::experimental::Tensor& %s"; + } input_args_str_list[input_position] = paddle::string::Sprintf(FWD_INS_ARG_TEMPLATE, input_name); @@ -1362,6 +1474,7 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Outs Map std::string outs_contents_str = ""; + std::string inplace_mapping_str = ""; for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); std::string outnum = "1"; @@ -1404,6 +1517,22 @@ static std::pair GenerateForwardFunctionContents( } core_ops_args_info[op_type].push_back(output_var_name); + } else if (!inplace_map.empty() && inplace_map.count(output_name)) { + // In inplace op, replace the output with the input directly. + PADDLE_ENFORCE_NE( + inplace_map[output_name], "", + paddle::platform::errors::InvalidArgument( + "Inplace op %s has no input corresponding to output %s.", op_type, + output_name)); + const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },"; + auto inplace_input_name = inplace_map[output_name]; + outs_contents_str += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, output_name, inplace_input_name); + + // inplace_map used in TraceOp. + const char* INPLACE_MAPPING_TEMPLATE = R"({"%s", "%s"},)"; + inplace_mapping_str += paddle::string::Sprintf( + INPLACE_MAPPING_TEMPLATE, inplace_input_name, output_name); } else { if (output.duplicable()) { outnum = output_name + "Num"; @@ -1430,6 +1559,8 @@ static std::pair GenerateForwardFunctionContents( } if (outs_contents_str.size() > 0) outs_contents_str.pop_back(); // Remove trailing "," + if (inplace_mapping_str.size() > 0) + inplace_mapping_str.pop_back(); // Remove trailing "," const char* FWD_OUTS_MAP_TEMPLATE = " std::map GenerateForwardFunctionContents( dygraph_function_args_str += ", const paddle::framework::AttributeMap& attr_map"; + /* --------- Generate TraceOp ----- */ + // TraceOp should be run after compute require_any_grad. (for checking + // inplace) + // `trace_op_body_str` will be passed as a parameter to + // `GenerateGradNodeCreationContent`. 
+ std::string trace_op_body_str = ""; // [Generation] Get TraceOp const char* FWD_TRACE_OP_TEMPLATE = " paddle::framework::AttributeMap attrs = attr_map;\n" @@ -1470,11 +1607,12 @@ static std::pair GenerateForwardFunctionContents( " egr::Controller::Instance().GetCurrentTracer()->TraceOp(\"%s\", ins, " "outs, attrs, \n" " egr::Controller::Instance().GetExpectedPlace(),\n" - " &default_attrs, true, {});\n"; - std::string trace_op_str = - paddle::string::Sprintf(FWD_TRACE_OP_TEMPLATE, op_type); - generated_function_body += trace_op_str; - generated_function_body += "\n"; + " &default_attrs, true, {%s});\n"; + std::string trace_op_str = paddle::string::Sprintf( + FWD_TRACE_OP_TEMPLATE, op_type, inplace_mapping_str); + + trace_op_body_str += trace_op_str; + trace_op_body_str += "\n"; VLOG(6) << "Generated AttrMap & TraceOp"; @@ -1539,48 +1677,64 @@ static std::pair GenerateForwardFunctionContents( output_varname, output_var_args_name); } } else { - const char* FWD_OUT_TENSOR_TEMPLATE = - " paddle::experimental::Tensor %s;\n" - " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n"; - out_tensor_str = - paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname, - output_name, output_varname); + if (!inplace_map.empty() && inplace_map.count(output_name)) { + // Modify meta info of inplace tensor. + // Bump inplace version of inplace tensor. + auto inplace_input_name = inplace_map[output_name]; + const char* FWD_OUT_TENSOR_TEMPLATE = + " egr::EagerUtils::ModifyInplaceInput(outs[\"%s\"][0], &%s);\n" + " %s.bump_inplace_version();\n" + " VLOG(3) << \"Tensor(\" << %s.name() << \") uses Inplace " + "Strategy.\";\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSOR_TEMPLATE, output_name, inplace_input_name, + inplace_input_name, inplace_input_name); + } else { + const char* FWD_OUT_TENSOR_TEMPLATE = + " paddle::experimental::Tensor %s;\n" + " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n"; + out_tensor_str = + paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname, + output_name, output_varname); + } } return_types[return_position] = "paddle::experimental::Tensor"; } - return_contents[return_position] = output_varname; - generated_function_body += out_tensor_str; + if (!inplace_map.empty() && inplace_map.count(output_name)) { + // Replace output directly with input in inplace op. 
+ return_contents[return_position] = inplace_map[output_name]; + } else { + return_contents[return_position] = output_varname; + } + trace_op_body_str += out_tensor_str; } - generated_function_body += "\n"; + trace_op_body_str += "\n"; VLOG(6) << "Converted Output VarBase to EagerVariable(s)"; + /* ------ END Generate TraceOp ----- */ // [Generation] Handle core_ops_returns_info - core_ops_returns_info[op_type] = return_contents; + // avoid inplace op changing core_ops_returns_info + if (core_ops_returns_info.empty() || !core_ops_returns_info.count(op_type)) { + core_ops_returns_info[op_type] = return_contents; + } // [Generation] ComputeRequireGrad -> GradNodeCreation if (!bwd_info.GenerateForwardOnly()) { - std::string grad_node_creation_body_str = - GenerateGradNodeCreationContent(fwd_info, bwd_info); - - // Add event record - std::string event_name = op_type + " node_creation"; - const char* NODE_CREATION_TEMPLATE = - "{\n" - " paddle::platform::RecordEvent node_creation_record_event(\"%s\", " - "paddle::platform::TracerEventType::Operator, 1);\n" - " %s\n" - "}"; - - grad_node_creation_body_str = paddle::string::Sprintf( - NODE_CREATION_TEMPLATE, event_name, grad_node_creation_body_str); + // If GradNode needs to be generated, pass `trace_op_body_str` + // into `GenerateGradNodeCreationContent`. + std::string grad_node_creation_body_str = GenerateGradNodeCreationContent( + fwd_info, bwd_info, trace_op_body_str, inplace_map); generated_function_body += grad_node_creation_body_str; generated_function_body += "\n"; // [Generation] Call RetainGradForTensor VLOG(6) << "Generated GradNode Creation codes"; + } else { + // If GradNode doesn't need to be generated, generate TraceOP directly. + generated_function_body += trace_op_body_str; } // [Generation] Handle return: Tuple/Vector/Tensor @@ -1627,7 +1781,13 @@ static std::pair GenerateForwardFunctionContents( VLOG(6) << "Generated return codes"; // [Generation] Get Full Function - std::string function_name = op_type + "_dygraph_function"; + std::string function_name; + if (inplace_map.empty()) { + function_name = op_type + "_dygraph_function"; + } else { + // change function_name for inplace op. 
+ function_name = op_type + "__dygraph_function"; + } if (dygraph_function_args_str.size() > 0) { auto iter = dygraph_function_args_str.begin(); @@ -1635,15 +1795,15 @@ static std::pair GenerateForwardFunctionContents( } const char* DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE = - "paddle::platform::RecordEvent dygraph_entrance_record_event(\"%s\", " + " paddle::platform::RecordEvent dygraph_entrance_record_event(\"%s\", " "paddle::platform::TracerEventType::Operator, 1);"; std::string event_name = op_type + " dygraph"; std::string fwd_record_event_str = paddle::string::Sprintf( DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE, event_name); const char* FWD_FUNCTION_TEMPLATE = "%s %s(%s) {\n\n" - " %s\n" - " %s\n" + "%s\n" + "%s\n" "}\n\n"; std::string fwd_function_str = paddle::string::Sprintf( FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name, @@ -2426,7 +2586,7 @@ static void DygraphCodeGeneration(const std::string& output_dir) { /* --------------------------- */ VLOG(6) << "-------- GenerateForwardFunctionContents -------"; std::pair body_and_declaration = - GenerateForwardFunctionContents(fwd_info, bwd_info); + GenerateForwardFunctionContents(fwd_info, bwd_info, {}); fwd_function_str += body_and_declaration.first + "\n"; @@ -2434,6 +2594,30 @@ static void DygraphCodeGeneration(const std::string& output_dir) { std::string fwd_function_declare_str = body_and_declaration.second; dygraph_forward_api_str += fwd_function_declare_str; + auto& infer_inplace = + paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_; + std::map inplace_map; + // Inplace Function Generator. + // `sum` op has duplicate input. Don't consider adding inplace strategy + // for `sum` in temporary. + if (op_type != "sum" && infer_inplace) { + auto in_to_outs = infer_inplace(true); + for (auto& inplace_pair : in_to_outs) { + inplace_map[inplace_pair.second] = inplace_pair.first; + } + + VLOG(6) << "-------- GenerateInplaceForwardFunctionContents -------"; + std::pair inplace_body_and_declaration = + GenerateForwardFunctionContents(fwd_info, bwd_info, inplace_map); + + fwd_function_str += inplace_body_and_declaration.first + "\n"; + + VLOG(6) << "-------- GenerateInplaceDygraphForwardAPIContents -------"; + std::string inplace_fwd_function_declare_str = + inplace_body_and_declaration.second; + dygraph_forward_api_str += inplace_fwd_function_declare_str; + } + if (bwd_info.GenerateForwardOnly()) continue; VLOG(6) << "-------- GenerateGradNodeHeaderContents -------"; diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 0e11444b815..8da27f3bb8a 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -36,6 +36,15 @@ class TensorWrapper { explicit TensorWrapper(const paddle::experimental::Tensor& tensor, bool full_reserved = false, bool no_need_buffer = false) { + // set inplace_version_snapshot_ according to tensor's current inplace + // version. + if (tensor.impl() && phi::DenseTensor::classof(tensor.impl().get())) { + phi::DenseTensor* dense_tensor = + static_cast(tensor.impl().get()); + auto& inplace_version_counter = dense_tensor->InplaceVersionCounter(); + inplace_version_snapshot_ = inplace_version_counter.CurrentVersion(); + } + /** * Normally, we should fully reserved all non-output or non-leaf fwd tensor * here. 
And for fwd output tensor, we should not reserve its autogradmeta, @@ -49,6 +58,7 @@ class TensorWrapper { } // shallow copy tensor_impl here + no_need_buffer_ = no_need_buffer; if (no_need_buffer) { if (phi::DenseTensor::classof(tensor.impl().get())) { // Only Copy Meta @@ -86,6 +96,7 @@ class TensorWrapper { // if it's full_reserved just return the full copy of tensor if (full_reserved_) { + check_inplace_version(); return intermidiate_tensor_; } else { std::shared_ptr new_grad_node = grad_node; @@ -94,15 +105,52 @@ class TensorWrapper { intermidiate_tensor_.set_autograd_meta( std::static_pointer_cast( p_ab_autograd_meta)); + check_inplace_version(); return intermidiate_tensor_; } } + void check_inplace_version() { + if (no_need_buffer_) { + VLOG(6) << "There's no need to check inplace_version because " + "no_need_buffer_ is true."; + return; + } + if (intermidiate_tensor_.impl() && + phi::DenseTensor::classof(intermidiate_tensor_.impl().get())) { + phi::DenseTensor* dense_tensor = + static_cast(intermidiate_tensor_.impl().get()); + auto& inplace_version_counter = dense_tensor->InplaceVersionCounter(); + + uint32_t current_inplace_version = + inplace_version_counter.CurrentVersion(); + PADDLE_ENFORCE_EQ( + current_inplace_version, inplace_version_snapshot_, + paddle::platform::errors::PermissionDenied( + "Tensor '%s' used in gradient computation has been " + "modified by an inplace operation. " + "Its version is %d but the expected version is %d. " + "Please fix your code to void calling an inplace operator " + "after using the Tensor which will used in gradient " + "computation.", + intermidiate_tensor_.name(), current_inplace_version, + inplace_version_snapshot_)); + VLOG(6) << " The inplace_version_snapshot_ of Tensor '" + << intermidiate_tensor_.name() << "' is [ " + << inplace_version_snapshot_ << " ]"; + VLOG(6) << " The current_inplace_version of Tensor '" + << intermidiate_tensor_.name() << "' is [ " + << current_inplace_version << " ]"; + } + } + void clear() { intermidiate_tensor_.reset(); } private: bool full_reserved_ = false; + bool no_need_buffer_ = false; std::pair out_rank_info_; paddle::experimental::Tensor intermidiate_tensor_; + uint32_t inplace_version_snapshot_ = 0; }; } // namespace egr diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 8a57d269453..048087903a4 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -212,6 +212,27 @@ std::vector> EagerUtils::CreateVars( return res; } +void EagerUtils::ModifyInplaceInput( + const std::shared_ptr& inplace_variable, + paddle::experimental::Tensor* inplace_tensor) { + // Only modify the meta information of the inplace tensor, because + // EagerVariable cannot modify Tensor's meta information after inplace + // op (such as ``reshape``) is executed. + PADDLE_ENFORCE_NOT_NULL(inplace_tensor, + paddle::platform::errors::Fatal( + "Inplace Tensor is null and cannot be modified. 
" + "We are tring to Modify Inplace Input from its " + "shared_ptr, this error may indicate the inplace " + " input is nullptr")); + if (phi::DenseTensor::classof(inplace_variable->GetTensorBase().get())) { + phi::DenseTensor* variable_dense_tensor = + static_cast(inplace_variable->GetTensorBase().get()); + phi::DenseTensor* tensor_dense_tensor = + static_cast(inplace_tensor->impl().get()); + tensor_dense_tensor->set_meta(variable_dense_tensor->meta()); + } +} + std::vector EagerUtils::GetOutputs( const std::vector>& outs) { std::vector res; diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index fa5735e6f32..fbd080ef70e 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" @@ -144,6 +145,19 @@ class EagerUtils { iter.apply(std::forward(args)...); } + static void CheckInplace(const paddle::experimental::Tensor& target, + const AutogradMeta* autograd_meta, + bool require_any_grad) { + if (require_any_grad && autograd_meta) { + PADDLE_ENFORCE_EQ(!autograd_meta->StopGradient() && + egr::egr_utils_api::IsLeafTensor(target), + false, paddle::platform::errors::InvalidArgument( + "Leaf Var (%s) that doesn't stop gradient " + "can't use inplace strategy.", + target.name())); + } + } + // TensorWrapper Utils static paddle::experimental::Tensor RecoverTensorWrapper( TensorWrapper* tw, const std::shared_ptr& grad_node); @@ -171,6 +185,9 @@ class EagerUtils { static std::vector> CreateVars( const size_t num); // Construct Tensor From var + static void ModifyInplaceInput( + const std::shared_ptr& inplace_variable, + paddle::experimental::Tensor* inplace_tensor); static std::vector GetOutputs( const std::vector>& outs); static paddle::experimental::Tensor GetOutput( diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index d4bbfa0e66e..e0a3931c3e3 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -718,6 +718,15 @@ static PyObject* set_grad_type(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor__inplace_version(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + uint32_t inplace_version = self->tensor.current_inplace_version(); + + return ToPyObject(inplace_version); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_methods[] = { {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -766,6 +775,8 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"_set_grad_type", (PyCFunction)(void (*)(void))set_grad_type, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_inplace_version", (PyCFunction)(void (*)(void))tensor__inplace_version, + METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; } // namespace pybind diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index 102cdbb91ab..685e20aef25 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -162,17 +162,22 @@ static inline std::string TempName(const std::string& name) { std::string GenerateOpFunctionsBody( const paddle::framework::proto::OpProto* op_proto, std::string func_name, - bool use_inplace_strategy = false, std::map inplace_map = {}) { 
auto& op_type = op_proto->type(); std::string input_args = ""; - std::string call_api_str = "auto out = " + op_type + "_dygraph_function("; + std::string call_api_str = ""; std::string ins_initializer_with_null = ""; std::string py_arg = ""; int arg_idx = 0; int input_args_num = 0; std::string ins_cast_str = ""; std::string view_strategy_str = ""; + if (!inplace_map.empty()) { + // change call_api_str for inplace op + call_api_str = "auto out = " + op_type + "__dygraph_function("; + } else { + call_api_str = "auto out = " + op_type + "_dygraph_function("; + } for (auto& input : op_proto->inputs()) { auto& in_name = input.name(); // skip those dispensable inputs, like ResidualData in conv2d @@ -288,8 +293,31 @@ std::string GenerateOpFunctionsBody( HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT, viwe_input_name, viwe_output_name, viwe_input_name, viwe_output_name); } - - return_str = "return ToPyObject(out);"; + if (!inplace_map.empty()) { + // For inplace op, Use the input PyObject directly. + for (auto& inplace_pair : inplace_map) { + // Find index of inplace tensor, and directly use input PyObject. + std::string inplace_arg_name = inplace_pair.second; + std::string inplace_return_name = inplace_pair.first; + const char* RETURN_INPLACE_TENSOR_TEMPLATE = + "ssize_t arg_id = GetIdxFromCoreOpsInfoMap(core_ops_args_info, " + "\"%s\", \"%s\");\n" + " ssize_t return_id = " + "GetIdxFromCoreOpsInfoMap(core_ops_returns_info, \"%s\", \"%s\");\n" + " return ToPyObject(out, return_id, args, arg_id);"; + return_str = paddle::string::Sprintf(RETURN_INPLACE_TENSOR_TEMPLATE, + op_type, inplace_arg_name, op_type, + inplace_return_name); + // only support one inplace_var in temporary. + PADDLE_ENFORCE_EQ( + inplace_map.size(), 1, + paddle::platform::errors::InvalidArgument( + "size of inplace_map must be 1, but got %d", inplace_map.size())); + break; + } + } else { + return_str = "return ToPyObject(out);"; + } std::string function_args = ""; if (input_args == "") { @@ -383,7 +411,8 @@ GenerateOpFunctions() { continue; } std::string func_name = "eager_api_" + op_type; - std::string op_function_str = GenerateOpFunctionsBody(op_proto, func_name); + std::string op_function_str = + GenerateOpFunctionsBody(op_proto, func_name, {}); // generate pybind item auto bind_function_str = paddle::string::Sprintf( @@ -391,6 +420,40 @@ GenerateOpFunctions() { op_function_list.emplace_back(std::move(op_function_str)); bind_function_list.emplace_back(std::move(bind_function_str)); + + // NOTE(pangyoki): Inplace Strategy. + // In this case, output will reuse input varbase. + // Dygraph mode needs to be aligned with the in-place strategy in static + // mode, and the mapping relationships between output and input that have + // been defined in static mode should be used in dygraph mode. + // Find which ops need to use Inplace strategy in static mode, and get the + // mapping relationship between Inplace output and input. + auto& infer_inplace = + paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_; + std::map inplace_map; + // `sum` op has duplicate input. Don't consider adding inplace strategy + // for `sum` in temporary. + if (op_type != "sum" && infer_inplace) { + // Inplace OP: op_type_. + // The inplace OP needs a new implementation method. 
+ auto in_to_outs = infer_inplace(true); + for (auto& inplace_pair : in_to_outs) { + inplace_map[inplace_pair.second] = inplace_pair.first; + } + + std::string inplace_op_type = op_type + "_"; + std::string inplace_func_name = "eager_api_" + inplace_op_type; + std::string inplace_op_function_str = + GenerateOpFunctionsBody(op_proto, inplace_func_name, inplace_map); + + // generate pybind item + auto inplace_bind_function_str = + paddle::string::Sprintf(PYBIND_ITEM_TEMPLATE, inplace_op_type, + inplace_func_name, inplace_op_type); + + op_function_list.emplace_back(std::move(inplace_op_function_str)); + bind_function_list.emplace_back(std::move(inplace_bind_function_str)); + } } if (append_custom_head_file) { op_function_list.emplace_back(CUSTOM_HANDWRITE_OP_FUNC_FILE); diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 97bb32630d7..a23bb1230e1 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -417,6 +417,8 @@ PyObject* ToPyObject(bool value) { PyObject* ToPyObject(int value) { return PyLong_FromLong(value); } +PyObject* ToPyObject(uint32_t value) { return PyLong_FromUnsignedLong(value); } + PyObject* ToPyObject(int64_t value) { return PyLong_FromLongLong(value); } PyObject* ToPyObject(float value) { return PyLong_FromDouble(value); } @@ -442,6 +444,20 @@ PyObject* ToPyObject(const paddle::experimental::Tensor& value) { return obj; } +PyObject* ToPyObject(const paddle::experimental::Tensor& value, + ssize_t value_idx, PyObject* args, ssize_t arg_idx) { + // For inplace op, directly return the input PyObject of the inplace tensor. + // [Parameter] + // value: Useless parameter. + // value_idx: Useless parameter. + // args: Input PyObject. + // arg_idx: Index of inplace PyObject in input args. Used to find the input + // inplace PyObject. 
+ PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); + Py_INCREF(obj); + return obj; +} + PyObject* ToPyObject(const std::vector& value) { PyObject* result = PyList_New((Py_ssize_t)value.size()); diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 1c4e2ab69a5..fba1485bcf4 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -56,6 +56,7 @@ framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, ssize_t arg_pos); PyObject* ToPyObject(int value); +PyObject* ToPyObject(uint32_t value); PyObject* ToPyObject(bool value); PyObject* ToPyObject(int64_t value); PyObject* ToPyObject(float value); @@ -63,6 +64,8 @@ PyObject* ToPyObject(double value); PyObject* ToPyObject(const char* value); PyObject* ToPyObject(const std::string& value); PyObject* ToPyObject(const paddle::experimental::Tensor& value); +PyObject* ToPyObject(const paddle::experimental::Tensor& value, + ssize_t value_idx, PyObject* args, ssize_t arg_idx); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); @@ -84,6 +87,17 @@ struct TupleTensorResult { TupleTensorResult::Run(out, result); PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out))); } + + static void Run(const Tuple& out, PyObject* result, ssize_t value_idx, + PyObject* args, ssize_t arg_idx) { + TupleTensorResult::Run(out, result, value_idx, args, arg_idx); + if (N - 1 == value_idx) { + PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out), + value_idx, args, arg_idx)); + } else { + PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out))); + } + } }; template @@ -91,6 +105,16 @@ struct TupleTensorResult { static void Run(const Tuple& out, PyObject* result) { PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out))); } + + static void Run(const Tuple& out, PyObject* result, ssize_t value_idx, + PyObject* args, ssize_t arg_idx) { + if (value_idx == 0) { + PyTuple_SET_ITEM(result, 0, + ToPyObject(std::get<0>(out), value_idx, args, arg_idx)); + } else { + PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out))); + } + } }; template @@ -103,6 +127,26 @@ PyObject* ToPyObject(const std::tuple& out) { return result; } +template +PyObject* ToPyObject(const std::tuple& out, ssize_t value_idx, + PyObject* args, ssize_t arg_idx) { + // For inplace op, directly return the input PyObject of the inplace tensor. + // [Parameter] + // out: Outputs tuple after executing op. + // value_idx: Index of inplace tensor in outputs tuple. Used to find the + // output inplace tensor. + // args: Input PyObject. + // arg_idx: Index of inplace PyObject in input args. Used to find the input + // inplace PyObject. + auto len = sizeof...(Args); + PyObject* result = PyTuple_New(len); + + TupleTensorResult::Run(out, result, value_idx, + args, arg_idx); + + return result; +} + paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, const std::string& op_type, ssize_t arg_pos); diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 09c3cea398b..1d483abd774 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -854,5 +854,30 @@ void InitOpsAttrTypeMap() { } } +ssize_t GetIdxFromCoreOpsInfoMap( + const std::unordered_map>& + core_ops_info_map, + const std::string& op_type, const std::string& name) { + // `core_ops_info_map` can be `core_ops_args_info` or `core_ops_returns_info`. 
+ // `core_ops_args_info`: get index from core_ops_args_info[op_type] according + // to input name. + // `core_ops_returns_info`: get index from core_ops_returns_info[op_type] + // according to return name. + if (!core_ops_info_map.count(op_type)) { + PADDLE_THROW(platform::errors::Fatal( + "Op %s is not found in core_ops_*_info map.", op_type)); + } else { + auto args_list = core_ops_info_map.at(op_type); + auto it = std::find(args_list.begin(), args_list.end(), name); + if (it == args_list.end()) { + PADDLE_THROW(platform::errors::Fatal("%s is not found in op %s's args.", + name, op_type)); + } else { + return std::distance(args_list.begin(), it); + } + } + return -1; +} + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_common.h b/paddle/fluid/pybind/op_function_common.h index 7ead9852667..33d0e242a02 100644 --- a/paddle/fluid/pybind/op_function_common.h +++ b/paddle/fluid/pybind/op_function_common.h @@ -146,5 +146,10 @@ unsigned long GetUnsignedLongFromArgs( // NOLINT void InitOpsAttrTypeMap(); +ssize_t GetIdxFromCoreOpsInfoMap( + const std::unordered_map>& + core_ops_info_map, + const std::string& op_type, const std::string& name); + } // namespace pybind } // namespace paddle diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index ce40627bb0d..eae8d12fb37 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -481,7 +481,21 @@ class PADDLE_API Tensor final { */ void set_autograd_meta(std::shared_ptr autograd_meta); - /* Part 9: Auto generated Tensor methods */ + /* Part 9: Inplace methods */ + + /** + * @brief Increase inplace version + */ + void bump_inplace_version(); + + /** + * @brief Get current inplace version + * + * @return uint32_t + */ + uint32_t current_inplace_version(); + + /* Part 10: Auto generated Tensor methods */ private: /** diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 6be85d72000..6090e6a400a 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -347,5 +347,36 @@ void Tensor::set_autograd_meta( autograd_meta_ = std::move(autograd_meta); } +void Tensor::bump_inplace_version() { + if (is_dense_tensor()) { + auto &inplace_version_counter = + std::dynamic_pointer_cast(impl_) + ->InplaceVersionCounter(); + VLOG(3) << "yoki: before bump inplace version: " + << inplace_version_counter.CurrentVersion(); + inplace_version_counter.Bump(); + VLOG(3) << "yoki: after bump inplace version: " + << inplace_version_counter.CurrentVersion(); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "bump_inplace_version is only supported on DenseTensor now.")); + } +} + +uint32_t Tensor::current_inplace_version() { + if (is_dense_tensor()) { + auto &inplace_version_counter = + std::dynamic_pointer_cast(impl_) + ->InplaceVersionCounter(); + VLOG(3) << "yoki: print version: " + << inplace_version_counter.CurrentVersion(); + return inplace_version_counter.CurrentVersion(); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "current_inplace_version is only supported on DenseTensor now.")); + } + return 0; +} + } // namespace experimental } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index c82172780b7..44e6f8e8f2a 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -960,6 +960,7 @@ set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120) 
set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 120) set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_profiler PROPERTIES TIMEOUT 120) +set_tests_properties(test_inplace_eager_fluid PROPERTIES TIMEOUT 120) set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT 120) set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_fetch_unmerged PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py b/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py new file mode 100644 index 00000000000..a434c562000 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py @@ -0,0 +1,397 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard + + +class TestDygraphInplace(unittest.TestCase): + def setUp(self): + self.init_data() + self.set_np_compare_func() + + def init_data(self): + self.input_var_numpy = np.random.uniform(-5, 5, [10, 20, 1]) + self.dtype = "float32" + + def set_np_compare_func(self): + self.np_compare = np.array_equal + + def non_inplace_api_processing(self, var): + return paddle.squeeze(var) + + def inplace_api_processing(self, var): + return paddle.squeeze_(var) + + def test_inplace_api(self): + with _test_eager_guard(): + var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype) + inplace_var = self.inplace_api_processing(var) + self.assertTrue(id(var) == id(inplace_var)) + + inplace_var.exp_() + self.assertTrue(np.array_equal(var.numpy(), inplace_var.numpy())) + + def test_forward_version(self): + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype) + self.assertEqual(var.inplace_version, 0) + + inplace_var = self.inplace_api_processing(var) + self.assertEqual(var.inplace_version, 1) + + inplace_var.exp_() + self.assertEqual(var.inplace_version, 2) + + inplace_var = self.inplace_api_processing(inplace_var) + self.assertEqual(var.inplace_version, 3) + + def test_leaf_inplace_var_error(self): + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype) + var.stop_gradient = False + + def leaf_inplace_error(): + self.inplace_api_processing(var) + + self.assertRaises(ValueError, leaf_inplace_error) + + def test_backward_error(self): + # It raises an error because the inplace operator will result + # in incorrect gradient computation. 
+ with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + + # Here, the gradient computation will use the value of var_b + var_c = var_b**2 + self.inplace_api_processing(var_b) + + loss = paddle.nn.functional.relu(var_c) + with self.assertRaisesRegexp( + RuntimeError, + "received current_inplace_version:{} != inplace_version_snapshot_:{}". + format(1, 0)): + loss.backward() + + def test_backward_success_1(self): + # var_b is modified inplace before using it, the inplace operator doesn't result + # in incorrect gradient computation. + grad_var_a, grad_var_a_inplace = 0, 1 + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + var_c = self.inplace_api_processing( + var_b) # var_b is modified inplace before using it + + # Here, the gradient computation will use the value of var_b + var_d = var_c**2 + loss = var_d.sum() + loss.backward() + grad_var_a_inplace = var_a.grad.numpy() + + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + var_c = self.non_inplace_api_processing(var_b) + var_d = var_c**2 + loss = var_d.sum() + loss.backward() + grad_var_a = var_a.grad.numpy() + + self.assertTrue(self.np_compare(grad_var_a_inplace, grad_var_a)) + + def test_backward_success_2(self): + # Although var_b is modified inplace after using it, it does not used in gradient computation. + # The inplace operator doesn't result in incorrect gradient computation. + grad_var_a, grad_var_a_inplace = 0, 1 + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + + var_c = self.inplace_api_processing( + var_b) # var_b is modified inplace before using it + + var_d = var_c + var_c # Here, the grad op of sum doesn't use the value of var_b + loss = var_d.sum() + + loss.backward() + grad_var_a_inplace = var_a.grad.numpy() + + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + + var_c = self.non_inplace_api_processing( + var_b) # var_b is modified inplace before using it + + var_d = var_c + var_c # Here, the grad op of sum doesn't use the value of var_b + loss = var_d.sum() + + loss.backward() + grad_var_a = var_a.grad.numpy() + self.assertTrue(np.array_equal(grad_var_a_inplace, grad_var_a)) + + +class TestDygraphInplaceUnsqueeze(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.unsqueeze(var, -1) + + def inplace_api_processing(self, var): + return paddle.unsqueeze_(var, -1) + + +class TestDygraphInplaceReshape(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.reshape(var, [-1]) + + def inplace_api_processing(self, var): + return paddle.reshape_(var, [-1]) + + +class TestDygraphInplaceFlatten(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.flatten() + + def inplace_api_processing(self, var): + return var.flatten_() + + +class TestDygraphInplaceScatter(TestDygraphInplace): + def init_data(self): + self.input_var_numpy = np.array([[1, 1], [2, 2], [3, 3]]) + self.dtype = 
"float32" + + def non_inplace_api_processing(self, var): + index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') + updates = paddle.to_tensor( + [[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') + + return paddle.scatter(var, index, updates, overwrite=False) + + def inplace_api_processing(self, var): + index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') + updates = paddle.to_tensor( + [[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') + + return paddle.scatter_(var, index, updates, overwrite=False) + + +class TestDygraphInplaceElu(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.nn.functional.elu(var) + + def inplace_api_processing(self, var): + return paddle.nn.functional.elu_(var) + + +class TestDygraphInplaceRelu(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.nn.functional.relu(var) + + def inplace_api_processing(self, var): + return paddle.nn.functional.relu_(var) + + +class TestDygraphInplaceSoftmax(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.nn.functional.softmax(var) + + def inplace_api_processing(self, var): + return paddle.nn.functional.softmax_(var) + + +class TestDygraphInplaceTanh(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.tanh(var) + + def inplace_api_processing(self, var): + return paddle.tanh_(var) + + +class TestDygraphInplaceCeil(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.ceil() + + def inplace_api_processing(self, var): + return var.ceil_() + + +class TestDygraphInplaceFloor(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.floor() + + def inplace_api_processing(self, var): + return var.floor_() + + +class TestDygraphInplaceExp(TestDygraphInplace): + def set_np_compare_func(self): + self.np_compare = np.allclose + + def non_inplace_api_processing(self, var): + return var.exp() + + def inplace_api_processing(self, var): + return var.exp_() + + +class TestDygraphInplaceReciprocal(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.reciprocal() + + def inplace_api_processing(self, var): + return var.reciprocal_() + + +class TestDygraphInplaceRound(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.round() + + def inplace_api_processing(self, var): + return var.round_() + + +class TestDygraphInplaceSqrt(TestDygraphInplace): + def init_data(self): + self.input_var_numpy = np.random.uniform(0, 5, [10, 20, 1]) + self.dtype = "float32" + + def non_inplace_api_processing(self, var): + return var.sqrt() + + def inplace_api_processing(self, var): + return var.sqrt_() + + +class TestDygraphInplaceRsqrt(TestDygraphInplaceSqrt): + def non_inplace_api_processing(self, var): + return var.rsqrt() + + def inplace_api_processing(self, var): + return var.rsqrt_() + + +class TestDygraphInplaceClip(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.clip(0.6, 1.5) + + def inplace_api_processing(self, var): + return var.clip_(0.6, 1.5) + + +class TestDygraphInplaceScale(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.scale(scale=2.0, bias=3.0) + + def inplace_api_processing(self, var): + return var.scale_(scale=2.0, bias=3.0) + + +class TestDygraphInplaceAdd(TestDygraphInplace): + def init_data(self): + self.input_var_numpy = np.random.rand(2, 3, 4) + self.dtype = "float32" + self.input_var_numpy_2 = np.random.rand(2, 3, 4).astype(self.dtype) + + def 
non_inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.add(input_var_2) + + def inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.add_(input_var_2) + + +class TestDygraphInplaceSubtract(TestDygraphInplaceAdd): + def non_inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.subtract(input_var_2) + + def inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.subtract_(input_var_2) + + +class TestLossIsInplaceVar(unittest.TestCase): + def test_loss_is_inplace_var(self): + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.ones((2, 2)) + var_a.stop_gradient = False + + var_b = var_a * 2 + loss = var_b.tanh_() + + loss.backward() + inplace_grad_var_a = var_a.grad.numpy() + + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.ones((2, 2)) + var_a.stop_gradient = False + + var_b = var_a * 2 + loss = var_b.tanh() + + loss.backward() + grad_var_a = var_a.grad.numpy() + + self.assertTrue(np.array_equal(inplace_grad_var_a, grad_var_a)) + + +class TestContinuouslyInplace(unittest.TestCase): + def test_continuously_inplace(self): + with _test_eager_guard(): + a = paddle.rand([2, 3]) + a.stop_gradient = False + b = a * 2 + + b.reshape_([-1]) + b.reshape_([2, 3]) + b.reshape_([-1]) + + b.backward() + + +if __name__ == '__main__': + unittest.main() -- GitLab
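
The snippet below is a minimal usage sketch distilled from the new unit tests in test_inplace_eager_fluid.py; it is not part of the patch itself. It assumes a Paddle build that already includes this change and uses only APIs exercised by those tests (_test_eager_guard, paddle.reshape_, Tensor.exp_, Tensor.inplace_version).

    # Sketch of the inplace behavior added by this patch, under the eager guard
    # used by the new tests. Assumes this patch is built into Paddle.
    import numpy as np
    import paddle
    from paddle.fluid.framework import _test_eager_guard

    with _test_eager_guard():
        # Inplace ops return the same Tensor object and bump its inplace version.
        var = paddle.to_tensor(np.random.rand(2, 3).astype("float32"))
        out = paddle.reshape_(var, [-1])
        assert id(out) == id(var)
        assert var.inplace_version == 1

        # Using a Tensor in gradient computation and then modifying it inplace
        # trips the version check recorded by TensorWrapper during backward.
        var_a = paddle.ones([2, 2])
        var_a.stop_gradient = False
        var_b = var_a ** 2
        var_c = var_b ** 2   # the grad of this op needs the original value of var_b
        var_b.exp_()         # inplace modification after var_b was captured
        loss = var_c.sum()
        try:
            loss.backward()
        except RuntimeError as err:
            # current_inplace_version no longer matches inplace_version_snapshot_
            print(err)

In the generated C++ this corresponds to the new `<op>__dygraph_function` path: the output slot reuses the input EagerVariable, `bump_inplace_version()` is called on the reused input, and `TensorWrapper::check_inplace_version()` compares the recorded snapshot against the current counter when the wrapped tensor is recovered for gradient computation.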