Unverified commit 8e612903 authored by pangyoki and committed by GitHub

support inplace in dygraph eager_fluid state (#40400)

* [Eager] Support eager grad interface, draft version

* Support eager grad interface with allow_unused and multi startup_op

* Fix code format

* Fix allow_unused case, return PyNone if tensor not initialize

* Support output's stop_gradient related to create_graph

* Support grad exception case in eager mode, fix coverage CI

* Update ToPyObject, return PyNone if not initialize

* AccumulationNode add FLAGS_retain_grad_for_all_tensor

* Fix ci issue

* Fix CI issue

* fix, use core.eager.Tensor

* Add func SetBufferSlotRankZeros for GradTensorHolder

* Support retain_graph by using ClearTensorWrappers

* Support retain_graph by using ClearTensorWrappers

* Update retain_graph and no_grad_vars related test case

* Update code gen logic for ClearTensorWrappers

* Fix by override statement

* fix override func args

* Support retain_graph, update unit tests

* Updated ClearTensorWrappers logic

* fix grad python interface

* Use deep copy and update unit tests

* Polish code

* Polish code

* Fix CI issue, Deep copy only use when user set grad_tensors

* Fix CI, use Backward instead RunBackward

* Fix CI, Declare kernel explicitly in test file

* Polish, remove vector of TensorWrapper

* Refactor the logic of grad/backward, polish codes

* Update code after merge upstream develop

* Polish after merge upstream develop

* Update to adapt new GradNodeBase superclass

* Fix error introduced during conflict resolution

* support inplace strategy in eager_fluid state

* solve conflict

* nothing

* Update purify potential_startup_nodes logic

* Fix errors

* Polish code

* Remove useless args for ToPyObject

* Remove useless TensorWrappersSet

* fix record conflict

* Fix code-format, re-install pre-commit

* fix tensor_wrapper bug

* Fix pre-process logic for potential_startup_ops

* Update unit tests, use eager mode

* Fix conflicts

* fix unittest timeout

* little change
Co-authored-by: Weilong Wu <veyron_wu@163.com>
Parent 50fad3ed
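A minimal, user-level sketch of what this commit enables (assuming the `_test_eager_guard` helper exercised by the new unit tests at the bottom of this diff): inplace APIs such as `reshape_` now run in eager (dygraph) mode and return the same Tensor object they were called on.

import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    x = paddle.ones([2, 3])
    y = paddle.reshape_(x, [-1])   # in-place: y aliases x
    assert id(x) == id(y)          # the same Tensor object is returned
    assert x.shape == [6]          # shape reflects the inplace reshape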
......@@ -30,7 +30,8 @@ namespace egr_utils_api {
bool IsLeafTensor(const paddle::experimental::Tensor& target) {
std::shared_ptr<GradNodeBase> grad_node = EagerUtils::grad_node(target);
if (std::dynamic_pointer_cast<GradNodeAccumulation>(grad_node)) {
if (!grad_node ||
std::dynamic_pointer_cast<GradNodeAccumulation>(grad_node)) {
return true;
}
......
......@@ -36,6 +36,15 @@ class TensorWrapper {
explicit TensorWrapper(const paddle::experimental::Tensor& tensor,
bool full_reserved = false,
bool no_need_buffer = false) {
// set inplace_version_snapshot_ according to tensor's current inplace
// version.
if (tensor.impl() && phi::DenseTensor::classof(tensor.impl().get())) {
phi::DenseTensor* dense_tensor =
static_cast<phi::DenseTensor*>(tensor.impl().get());
auto& inplace_version_counter = dense_tensor->InplaceVersionCounter();
inplace_version_snapshot_ = inplace_version_counter.CurrentVersion();
}
/**
* Normally, we should fully reserved all non-output or non-leaf fwd tensor
* here. And for fwd output tensor, we should not reserve its autogradmeta,
......@@ -49,6 +58,7 @@ class TensorWrapper {
}
// shallow copy tensor_impl here
no_need_buffer_ = no_need_buffer;
if (no_need_buffer) {
if (phi::DenseTensor::classof(tensor.impl().get())) {
// Only Copy Meta
......@@ -86,6 +96,7 @@ class TensorWrapper {
// if it's full_reserved just return the full copy of tensor
if (full_reserved_) {
check_inplace_version();
return intermidiate_tensor_;
} else {
std::shared_ptr<GradNodeBase> new_grad_node = grad_node;
......@@ -94,15 +105,52 @@ class TensorWrapper {
intermidiate_tensor_.set_autograd_meta(
std::static_pointer_cast<paddle::experimental::AbstractAutogradMeta>(
p_ab_autograd_meta));
check_inplace_version();
return intermidiate_tensor_;
}
}
void check_inplace_version() {
if (no_need_buffer_) {
VLOG(6) << "There's no need to check inplace_version because "
"no_need_buffer_ is true.";
return;
}
if (intermidiate_tensor_.impl() &&
phi::DenseTensor::classof(intermidiate_tensor_.impl().get())) {
phi::DenseTensor* dense_tensor =
static_cast<phi::DenseTensor*>(intermidiate_tensor_.impl().get());
auto& inplace_version_counter = dense_tensor->InplaceVersionCounter();
uint32_t current_inplace_version =
inplace_version_counter.CurrentVersion();
PADDLE_ENFORCE_EQ(
current_inplace_version, inplace_version_snapshot_,
paddle::platform::errors::PermissionDenied(
"Tensor '%s' used in gradient computation has been "
"modified by an inplace operation. "
"Its version is %d but the expected version is %d. "
"Please fix your code to void calling an inplace operator "
"after using the Tensor which will used in gradient "
"computation.",
intermidiate_tensor_.name(), current_inplace_version,
inplace_version_snapshot_));
VLOG(6) << " The inplace_version_snapshot_ of Tensor '"
<< intermidiate_tensor_.name() << "' is [ "
<< inplace_version_snapshot_ << " ]";
VLOG(6) << " The current_inplace_version of Tensor '"
<< intermidiate_tensor_.name() << "' is [ "
<< current_inplace_version << " ]";
}
}
void clear() { intermidiate_tensor_.reset(); }
private:
bool full_reserved_ = false;
bool no_need_buffer_ = false;
std::pair<size_t, size_t> out_rank_info_;
paddle::experimental::Tensor intermidiate_tensor_;
uint32_t inplace_version_snapshot_ = 0;
};
} // namespace egr
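A sketch of the failure mode that check_inplace_version() above guards against (it mirrors `test_backward_error` in the new unit test file below): modifying a tensor in place after it has been captured for gradient computation makes backward raise a version-mismatch error.

import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    a = paddle.to_tensor([1.0, 2.0, 3.0])
    a.stop_gradient = False
    b = a ** 2
    c = b ** 2          # the grad of c needs the original value of b
    b.exp_()            # bumps b's inplace version past the recorded snapshot
    try:
        c.sum().backward()
    except RuntimeError as err:
        print(err)      # reports the inplace version mismatch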
......@@ -212,6 +212,27 @@ std::vector<std::shared_ptr<EagerVariable>> EagerUtils::CreateVars(
return res;
}
void EagerUtils::ModifyInplaceInput(
const std::shared_ptr<EagerVariable>& inplace_variable,
paddle::experimental::Tensor* inplace_tensor) {
// Only modify the meta information of the inplace tensor, because
// EagerVariable cannot modify Tensor's meta information after inplace
// op (such as ``reshape``) is executed.
PADDLE_ENFORCE_NOT_NULL(inplace_tensor,
paddle::platform::errors::Fatal(
"Inplace Tensor is null and cannot be modified. "
"We are tring to Modify Inplace Input from its "
"shared_ptr, this error may indicate the inplace "
" input is nullptr"));
if (phi::DenseTensor::classof(inplace_variable->GetTensorBase().get())) {
phi::DenseTensor* variable_dense_tensor =
static_cast<phi::DenseTensor*>(inplace_variable->GetTensorBase().get());
phi::DenseTensor* tensor_dense_tensor =
static_cast<phi::DenseTensor*>(inplace_tensor->impl().get());
tensor_dense_tensor->set_meta(variable_dense_tensor->meta());
}
}
std::vector<paddle::experimental::Tensor> EagerUtils::GetOutputs(
const std::vector<std::shared_ptr<EagerVariable>>& outs) {
std::vector<paddle::experimental::Tensor> res;
......
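A sketch of why `ModifyInplaceInput` syncs meta information back to the user-facing tensor (the exact internal call path is assumed here): an inplace op such as `reshape_` changes the shape stored in the underlying DenseTensor, and the Python-side Tensor must observe the new shape.

import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    x = paddle.ones([2, 3])
    x.reshape_([6])
    print(x.shape)      # [6]: the updated meta is visible on the same Tensor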
......@@ -14,6 +14,7 @@
#pragma once
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/grad_node_info.h"
......@@ -144,6 +145,19 @@ class EagerUtils {
iter.apply(std::forward<Args>(args)...);
}
static void CheckInplace(const paddle::experimental::Tensor& target,
const AutogradMeta* autograd_meta,
bool require_any_grad) {
if (require_any_grad && autograd_meta) {
PADDLE_ENFORCE_EQ(!autograd_meta->StopGradient() &&
egr::egr_utils_api::IsLeafTensor(target),
false, paddle::platform::errors::InvalidArgument(
"Leaf Var (%s) that doesn't stop gradient "
"can't use inplace strategy.",
target.name()));
}
}
// TensorWrapper Utils
static paddle::experimental::Tensor RecoverTensorWrapper(
TensorWrapper* tw, const std::shared_ptr<GradNodeBase>& grad_node);
......@@ -171,6 +185,9 @@ class EagerUtils {
static std::vector<std::shared_ptr<EagerVariable>> CreateVars(
const size_t num);
// Construct Tensor From var
static void ModifyInplaceInput(
const std::shared_ptr<EagerVariable>& inplace_variable,
paddle::experimental::Tensor* inplace_tensor);
static std::vector<paddle::experimental::Tensor> GetOutputs(
const std::vector<std::shared_ptr<EagerVariable>>& outs);
static paddle::experimental::Tensor GetOutput(
......
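`CheckInplace` above is what makes in-place ops on leaf tensors that require grad fail fast; a small sketch (mirrors `test_leaf_inplace_var_error` below):

import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    w = paddle.ones([2, 2])
    w.stop_gradient = False     # a leaf tensor that requires grad
    try:
        w.exp_()                # in-place on such a leaf is rejected
    except ValueError as err:
        print(err)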
......@@ -718,6 +718,15 @@ static PyObject* set_grad_type(TensorObject* self, PyObject* args,
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor__inplace_version(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
uint32_t inplace_version = self->tensor.current_inplace_version();
return ToPyObject(inplace_version);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
PyMethodDef variable_methods[] = {
{"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy,
METH_VARARGS | METH_KEYWORDS, NULL},
......@@ -766,6 +775,8 @@ PyMethodDef variable_methods[] = {
METH_VARARGS | METH_KEYWORDS, NULL},
{"_set_grad_type", (PyCFunction)(void (*)(void))set_grad_type,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_inplace_version", (PyCFunction)(void (*)(void))tensor__inplace_version,
METH_VARARGS | METH_KEYWORDS, NULL},
{NULL, NULL, 0, NULL}};
} // namespace pybind
......
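The new `_inplace_version` binding is, presumably, the eager-mode hook behind the Python-side `inplace_version` property used by the tests below; each in-place op bumps the counter by one.

import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    t = paddle.ones([4])
    print(t.inplace_version)    # 0
    t.exp_()
    print(t.inplace_version)    # 1
    t.reshape_([2, 2])
    print(t.inplace_version)    # 2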
......@@ -162,17 +162,22 @@ static inline std::string TempName(const std::string& name) {
std::string GenerateOpFunctionsBody(
const paddle::framework::proto::OpProto* op_proto, std::string func_name,
bool use_inplace_strategy = false,
std::map<std::string, std::string> inplace_map = {}) {
auto& op_type = op_proto->type();
std::string input_args = "";
std::string call_api_str = "auto out = " + op_type + "_dygraph_function(";
std::string call_api_str = "";
std::string ins_initializer_with_null = "";
std::string py_arg = "";
int arg_idx = 0;
int input_args_num = 0;
std::string ins_cast_str = "";
std::string view_strategy_str = "";
if (!inplace_map.empty()) {
// change call_api_str for inplace op
call_api_str = "auto out = " + op_type + "__dygraph_function(";
} else {
call_api_str = "auto out = " + op_type + "_dygraph_function(";
}
for (auto& input : op_proto->inputs()) {
auto& in_name = input.name();
// skip those dispensable inputs, like ResidualData in conv2d
......@@ -288,8 +293,31 @@ std::string GenerateOpFunctionsBody(
HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT, viwe_input_name, viwe_output_name,
viwe_input_name, viwe_output_name);
}
return_str = "return ToPyObject(out);";
if (!inplace_map.empty()) {
// For an inplace op, use the input PyObject directly.
for (auto& inplace_pair : inplace_map) {
// Find index of inplace tensor, and directly use input PyObject.
std::string inplace_arg_name = inplace_pair.second;
std::string inplace_return_name = inplace_pair.first;
const char* RETURN_INPLACE_TENSOR_TEMPLATE =
"ssize_t arg_id = GetIdxFromCoreOpsInfoMap(core_ops_args_info, "
"\"%s\", \"%s\");\n"
" ssize_t return_id = "
"GetIdxFromCoreOpsInfoMap(core_ops_returns_info, \"%s\", \"%s\");\n"
" return ToPyObject(out, return_id, args, arg_id);";
return_str = paddle::string::Sprintf(RETURN_INPLACE_TENSOR_TEMPLATE,
op_type, inplace_arg_name, op_type,
inplace_return_name);
// Only one inplace_var is supported for now.
PADDLE_ENFORCE_EQ(
inplace_map.size(), 1,
paddle::platform::errors::InvalidArgument(
"size of inplace_map must be 1, but got %d", inplace_map.size()));
break;
}
} else {
return_str = "return ToPyObject(out);";
}
std::string function_args = "";
if (input_args == "") {
......@@ -383,7 +411,8 @@ GenerateOpFunctions() {
continue;
}
std::string func_name = "eager_api_" + op_type;
std::string op_function_str = GenerateOpFunctionsBody(op_proto, func_name);
std::string op_function_str =
GenerateOpFunctionsBody(op_proto, func_name, {});
// generate pybind item
auto bind_function_str = paddle::string::Sprintf(
......@@ -391,6 +420,40 @@ GenerateOpFunctions() {
op_function_list.emplace_back(std::move(op_function_str));
bind_function_list.emplace_back(std::move(bind_function_str));
// NOTE(pangyoki): Inplace Strategy.
// In this case, output will reuse input varbase.
// Dygraph mode needs to be aligned with the in-place strategy in static
// mode, and the mapping relationships between output and input that have
// been defined in static mode should be used in dygraph mode.
// Find which ops need to use Inplace strategy in static mode, and get the
// mapping relationship between Inplace output and input.
auto& infer_inplace =
paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_;
std::map<std::string, std::string> inplace_map;
// The `sum` op has duplicate inputs. Don't consider adding an inplace strategy
// for `sum` for now.
if (op_type != "sum" && infer_inplace) {
// Inplace OP: op_type_.
// The inplace OP needs a new implementation method.
auto in_to_outs = infer_inplace(true);
for (auto& inplace_pair : in_to_outs) {
inplace_map[inplace_pair.second] = inplace_pair.first;
}
std::string inplace_op_type = op_type + "_";
std::string inplace_func_name = "eager_api_" + inplace_op_type;
std::string inplace_op_function_str =
GenerateOpFunctionsBody(op_proto, inplace_func_name, inplace_map);
// generate pybind item
auto inplace_bind_function_str =
paddle::string::Sprintf(PYBIND_ITEM_TEMPLATE, inplace_op_type,
inplace_func_name, inplace_op_type);
op_function_list.emplace_back(std::move(inplace_op_function_str));
bind_function_list.emplace_back(std::move(inplace_bind_function_str));
}
}
if (append_custom_head_file) {
op_function_list.emplace_back(CUSTOM_HANDWRITE_OP_FUNC_FILE);
......
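The generated inplace binding returns the input PyObject itself through the new `ToPyObject(value, value_idx, args, arg_idx)` overload, so Python-level identity is preserved; a hedged sketch (the dispatch from `paddle.tanh_` to the generated `eager_api_tanh_` is assumed here):

import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    v = paddle.ones([3])
    out = paddle.tanh_(v)       # inplace API, served by the generated binding
    assert out is v             # the same PyObject comes back, not a copy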
......@@ -417,6 +417,8 @@ PyObject* ToPyObject(bool value) {
PyObject* ToPyObject(int value) { return PyLong_FromLong(value); }
PyObject* ToPyObject(uint32_t value) { return PyLong_FromUnsignedLong(value); }
PyObject* ToPyObject(int64_t value) { return PyLong_FromLongLong(value); }
PyObject* ToPyObject(float value) { return PyLong_FromDouble(value); }
......@@ -442,6 +444,20 @@ PyObject* ToPyObject(const paddle::experimental::Tensor& value) {
return obj;
}
PyObject* ToPyObject(const paddle::experimental::Tensor& value,
ssize_t value_idx, PyObject* args, ssize_t arg_idx) {
// For an inplace op, directly return the input PyObject of the inplace tensor.
// [Parameters]
// value: unused.
// value_idx: unused.
// args: input PyObject tuple.
// arg_idx: index of the inplace PyObject in the input args; used to find the
// input inplace PyObject.
PyObject* obj = PyTuple_GET_ITEM(args, arg_idx);
Py_INCREF(obj);
return obj;
}
PyObject* ToPyObject(const std::vector<bool>& value) {
PyObject* result = PyList_New((Py_ssize_t)value.size());
......
......@@ -56,6 +56,7 @@ framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj,
ssize_t arg_pos);
PyObject* ToPyObject(int value);
PyObject* ToPyObject(uint32_t value);
PyObject* ToPyObject(bool value);
PyObject* ToPyObject(int64_t value);
PyObject* ToPyObject(float value);
......@@ -63,6 +64,8 @@ PyObject* ToPyObject(double value);
PyObject* ToPyObject(const char* value);
PyObject* ToPyObject(const std::string& value);
PyObject* ToPyObject(const paddle::experimental::Tensor& value);
PyObject* ToPyObject(const paddle::experimental::Tensor& value,
ssize_t value_idx, PyObject* args, ssize_t arg_idx);
PyObject* ToPyObject(const std::vector<bool>& value);
PyObject* ToPyObject(const std::vector<int>& value);
PyObject* ToPyObject(const std::vector<int64_t>& value);
......@@ -84,6 +87,17 @@ struct TupleTensorResult {
TupleTensorResult<Tuple, N - 1>::Run(out, result);
PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get<N - 1>(out)));
}
static void Run(const Tuple& out, PyObject* result, ssize_t value_idx,
PyObject* args, ssize_t arg_idx) {
TupleTensorResult<Tuple, N - 1>::Run(out, result, value_idx, args, arg_idx);
if (N - 1 == value_idx) {
PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get<N - 1>(out),
value_idx, args, arg_idx));
} else {
PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get<N - 1>(out)));
}
}
};
template <typename Tuple>
......@@ -91,6 +105,16 @@ struct TupleTensorResult<Tuple, 1> {
static void Run(const Tuple& out, PyObject* result) {
PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out)));
}
static void Run(const Tuple& out, PyObject* result, ssize_t value_idx,
PyObject* args, ssize_t arg_idx) {
if (value_idx == 0) {
PyTuple_SET_ITEM(result, 0,
ToPyObject(std::get<0>(out), value_idx, args, arg_idx));
} else {
PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out)));
}
}
};
template <typename... Args>
......@@ -103,6 +127,26 @@ PyObject* ToPyObject(const std::tuple<Args...>& out) {
return result;
}
template <typename... Args>
PyObject* ToPyObject(const std::tuple<Args...>& out, ssize_t value_idx,
PyObject* args, ssize_t arg_idx) {
// For inplace op, directly return the input PyObject of the inplace tensor.
// [Parameter]
// out: Outputs tuple after executing op.
// value_idx: Index of inplace tensor in outputs tuple. Used to find the
// output inplace tensor.
// args: Input PyObject.
// arg_idx: Index of inplace PyObject in input args. Used to find the input
// inplace PyObject.
auto len = sizeof...(Args);
PyObject* result = PyTuple_New(len);
TupleTensorResult<decltype(out), sizeof...(Args)>::Run(out, result, value_idx,
args, arg_idx);
return result;
}
paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj,
const std::string& op_type,
ssize_t arg_pos);
......
......@@ -854,5 +854,30 @@ void InitOpsAttrTypeMap() {
}
}
ssize_t GetIdxFromCoreOpsInfoMap(
const std::unordered_map<std::string, std::vector<std::string>>&
core_ops_info_map,
const std::string& op_type, const std::string& name) {
// `core_ops_info_map` can be `core_ops_args_info` or `core_ops_returns_info`.
// `core_ops_args_info`: get index from core_ops_args_info[op_type] according
// to input name.
// `core_ops_returns_info`: get index from core_ops_returns_info[op_type]
// according to return name.
if (!core_ops_info_map.count(op_type)) {
PADDLE_THROW(platform::errors::Fatal(
"Op %s is not found in core_ops_*_info map.", op_type));
} else {
auto args_list = core_ops_info_map.at(op_type);
auto it = std::find(args_list.begin(), args_list.end(), name);
if (it == args_list.end()) {
PADDLE_THROW(platform::errors::Fatal("%s is not found in op %s's args.",
name, op_type));
} else {
return std::distance(args_list.begin(), it);
}
}
return -1;
}
} // namespace pybind
} // namespace paddle
......@@ -146,5 +146,10 @@ unsigned long GetUnsignedLongFromArgs( // NOLINT
void InitOpsAttrTypeMap();
ssize_t GetIdxFromCoreOpsInfoMap(
const std::unordered_map<std::string, std::vector<std::string>>&
core_ops_info_map,
const std::string& op_type, const std::string& name);
} // namespace pybind
} // namespace paddle
......@@ -481,7 +481,21 @@ class PADDLE_API Tensor final {
*/
void set_autograd_meta(std::shared_ptr<AbstractAutogradMeta> autograd_meta);
/* Part 9: Auto generated Tensor methods */
/* Part 9: Inplace methods */
/**
* @brief Increase inplace version
*/
void bump_inplace_version();
/**
* @brief Get current inplace version
*
* @return uint32_t
*/
uint32_t current_inplace_version();
/* Part 10: Auto generated Tensor methods */
private:
/**
......
......@@ -347,5 +347,36 @@ void Tensor::set_autograd_meta(
autograd_meta_ = std::move(autograd_meta);
}
void Tensor::bump_inplace_version() {
if (is_dense_tensor()) {
auto &inplace_version_counter =
std::dynamic_pointer_cast<phi::DenseTensor>(impl_)
->InplaceVersionCounter();
VLOG(3) << "yoki: before bump inplace version: "
<< inplace_version_counter.CurrentVersion();
inplace_version_counter.Bump();
VLOG(3) << "yoki: after bump inplace version: "
<< inplace_version_counter.CurrentVersion();
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"bump_inplace_version is only supported on DenseTensor now."));
}
}
uint32_t Tensor::current_inplace_version() {
if (is_dense_tensor()) {
auto &inplace_version_counter =
std::dynamic_pointer_cast<phi::DenseTensor>(impl_)
->InplaceVersionCounter();
VLOG(3) << "yoki: print version: "
<< inplace_version_counter.CurrentVersion();
return inplace_version_counter.CurrentVersion();
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"current_inplace_version is only supported on DenseTensor now."));
}
return 0;
}
} // namespace experimental
} // namespace paddle
......@@ -960,6 +960,7 @@ set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120)
set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 120)
set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120)
set_tests_properties(test_profiler PROPERTIES TIMEOUT 120)
set_tests_properties(test_inplace_eager_fluid PROPERTIES TIMEOUT 120)
set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT 120)
set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120)
set_tests_properties(test_fetch_unmerged PROPERTIES TIMEOUT 120)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
from paddle.fluid.framework import _test_eager_guard
class TestDygraphInplace(unittest.TestCase):
def setUp(self):
self.init_data()
self.set_np_compare_func()
def init_data(self):
self.input_var_numpy = np.random.uniform(-5, 5, [10, 20, 1])
self.dtype = "float32"
def set_np_compare_func(self):
self.np_compare = np.array_equal
def non_inplace_api_processing(self, var):
return paddle.squeeze(var)
def inplace_api_processing(self, var):
return paddle.squeeze_(var)
def test_inplace_api(self):
with _test_eager_guard():
var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype)
inplace_var = self.inplace_api_processing(var)
self.assertTrue(id(var) == id(inplace_var))
inplace_var.exp_()
self.assertTrue(np.array_equal(var.numpy(), inplace_var.numpy()))
def test_forward_version(self):
with paddle.fluid.dygraph.guard():
with _test_eager_guard():
var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype)
self.assertEqual(var.inplace_version, 0)
inplace_var = self.inplace_api_processing(var)
self.assertEqual(var.inplace_version, 1)
inplace_var.exp_()
self.assertEqual(var.inplace_version, 2)
inplace_var = self.inplace_api_processing(inplace_var)
self.assertEqual(var.inplace_version, 3)
def test_leaf_inplace_var_error(self):
with paddle.fluid.dygraph.guard():
with _test_eager_guard():
var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype)
var.stop_gradient = False
def leaf_inplace_error():
self.inplace_api_processing(var)
self.assertRaises(ValueError, leaf_inplace_error)
def test_backward_error(self):
# It raises an error because the inplace operator will result
# in incorrect gradient computation.
with paddle.fluid.dygraph.guard():
with _test_eager_guard():
var_a = paddle.to_tensor(self.input_var_numpy).astype(
self.dtype)
var_a.stop_gradient = False
var_b = var_a**2
# Here, the gradient computation will use the value of var_b
var_c = var_b**2
self.inplace_api_processing(var_b)
loss = paddle.nn.functional.relu(var_c)
with self.assertRaisesRegexp(
RuntimeError,
"received current_inplace_version:{} != inplace_version_snapshot_:{}".
format(1, 0)):
loss.backward()
def test_backward_success_1(self):
# var_b is modified in place before it is used, so the inplace operator
# doesn't result in incorrect gradient computation.
grad_var_a, grad_var_a_inplace = 0, 1
with paddle.fluid.dygraph.guard():
with _test_eager_guard():
var_a = paddle.to_tensor(self.input_var_numpy).astype(
self.dtype)
var_a.stop_gradient = False
var_b = var_a**2
var_c = self.inplace_api_processing(
var_b) # var_b is modified inplace before using it
# Here, the gradient computation will use the value of var_b
var_d = var_c**2
loss = var_d.sum()
loss.backward()
grad_var_a_inplace = var_a.grad.numpy()
with paddle.fluid.dygraph.guard():
with _test_eager_guard():
var_a = paddle.to_tensor(self.input_var_numpy).astype(
self.dtype)
var_a.stop_gradient = False
var_b = var_a**2
var_c = self.non_inplace_api_processing(var_b)
var_d = var_c**2
loss = var_d.sum()
loss.backward()
grad_var_a = var_a.grad.numpy()
self.assertTrue(self.np_compare(grad_var_a_inplace, grad_var_a))
def test_backward_success_2(self):
# Although var_b is modified in place after it is used, its value is not
# used in gradient computation, so the inplace operator doesn't result in
# incorrect gradient computation.
grad_var_a, grad_var_a_inplace = 0, 1
with paddle.fluid.dygraph.guard():
with _test_eager_guard():
var_a = paddle.to_tensor(self.input_var_numpy).astype(
self.dtype)
var_a.stop_gradient = False
var_b = var_a**2
var_c = self.inplace_api_processing(
var_b) # var_b is modified inplace before using it
var_d = var_c + var_c # Here, the grad op of sum doesn't use the value of var_b
loss = var_d.sum()
loss.backward()
grad_var_a_inplace = var_a.grad.numpy()
with paddle.fluid.dygraph.guard():
with _test_eager_guard():
var_a = paddle.to_tensor(self.input_var_numpy).astype(
self.dtype)
var_a.stop_gradient = False
var_b = var_a**2
var_c = self.non_inplace_api_processing(
var_b)  # non-inplace counterpart: var_b is left unmodified
var_d = var_c + var_c # Here, the grad op of sum doesn't use the value of var_b
loss = var_d.sum()
loss.backward()
grad_var_a = var_a.grad.numpy()
self.assertTrue(np.array_equal(grad_var_a_inplace, grad_var_a))
class TestDygraphInplaceUnsqueeze(TestDygraphInplace):
def non_inplace_api_processing(self, var):
return paddle.unsqueeze(var, -1)
def inplace_api_processing(self, var):
return paddle.unsqueeze_(var, -1)
class TestDygraphInplaceReshape(TestDygraphInplace):
def non_inplace_api_processing(self, var):
return paddle.reshape(var, [-1])
def inplace_api_processing(self, var):
return paddle.reshape_(var, [-1])
class TestDygraphInplaceFlatten(TestDygraphInplace):
def non_inplace_api_processing(self, var):
return var.flatten()
def inplace_api_processing(self, var):
return var.flatten_()
class TestDygraphInplaceScatter(TestDygraphInplace):
def init_data(self):
self.input_var_numpy = np.array([[1, 1], [2, 2], [3, 3]])
self.dtype = "float32"
def non_inplace_api_processing(self, var):
index = paddle.to_tensor([2, 1, 0, 1], dtype='int64')
updates = paddle.to_tensor(
[[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32')
return paddle.scatter(var, index, updates, overwrite=False)
def inplace_api_processing(self, var):
index = paddle.to_tensor([2, 1, 0, 1], dtype='int64')
updates = paddle.to_tensor(
[[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32')
return paddle.scatter_(var, index, updates, overwrite=False)
class TestDygraphInplaceElu(TestDygraphInplace):
def non_inplace_api_processing(self, var):
return paddle.nn.functional.elu(var)
def inplace_api_processing(self, var):
return paddle.nn.functional.elu_(var)
class TestDygraphInplaceRelu(TestDygraphInplace):
def non_inplace_api_processing(self, var):
return paddle.nn.functional.relu(var)
def inplace_api_processing(self, var):
return paddle.nn.functional.relu_(var)
class TestDygraphInplaceSoftmax(TestDygraphInplace):
def non_inplace_api_processing(self, var):
return paddle.nn.functional.softmax(var)
def inplace_api_processing(self, var):
return paddle.nn.functional.softmax_(var)
class TestDygraphInplaceTanh(TestDygraphInplace):
def non_inplace_api_processing(self, var):
return paddle.tanh(var)
def inplace_api_processing(self, var):
return paddle.tanh_(var)
class TestDygraphInplaceCeil(TestDygraphInplace):
def non_inplace_api_processing(self, var):
return var.ceil()
def inplace_api_processing(self, var):
return var.ceil_()
class TestDygraphInplaceFloor(TestDygraphInplace):
def non_inplace_api_processing(self, var):
return var.floor()
def inplace_api_processing(self, var):
return var.floor_()
class TestDygraphInplaceExp(TestDygraphInplace):
def set_np_compare_func(self):
self.np_compare = np.allclose
def non_inplace_api_processing(self, var):
return var.exp()
def inplace_api_processing(self, var):
return var.exp_()
class TestDygraphInplaceReciprocal(TestDygraphInplace):
def non_inplace_api_processing(self, var):
return var.reciprocal()
def inplace_api_processing(self, var):
return var.reciprocal_()
class TestDygraphInplaceRound(TestDygraphInplace):
def non_inplace_api_processing(self, var):
return var.round()
def inplace_api_processing(self, var):
return var.round_()
class TestDygraphInplaceSqrt(TestDygraphInplace):
def init_data(self):
self.input_var_numpy = np.random.uniform(0, 5, [10, 20, 1])
self.dtype = "float32"
def non_inplace_api_processing(self, var):
return var.sqrt()
def inplace_api_processing(self, var):
return var.sqrt_()
class TestDygraphInplaceRsqrt(TestDygraphInplaceSqrt):
def non_inplace_api_processing(self, var):
return var.rsqrt()
def inplace_api_processing(self, var):
return var.rsqrt_()
class TestDygraphInplaceClip(TestDygraphInplace):
def non_inplace_api_processing(self, var):
return var.clip(0.6, 1.5)
def inplace_api_processing(self, var):
return var.clip_(0.6, 1.5)
class TestDygraphInplaceScale(TestDygraphInplace):
def non_inplace_api_processing(self, var):
return var.scale(scale=2.0, bias=3.0)
def inplace_api_processing(self, var):
return var.scale_(scale=2.0, bias=3.0)
class TestDygraphInplaceAdd(TestDygraphInplace):
def init_data(self):
self.input_var_numpy = np.random.rand(2, 3, 4)
self.dtype = "float32"
self.input_var_numpy_2 = np.random.rand(2, 3, 4).astype(self.dtype)
def non_inplace_api_processing(self, var):
input_var_2 = paddle.to_tensor(self.input_var_numpy_2)
return var.add(input_var_2)
def inplace_api_processing(self, var):
input_var_2 = paddle.to_tensor(self.input_var_numpy_2)
return var.add_(input_var_2)
class TestDygraphInplaceSubtract(TestDygraphInplaceAdd):
def non_inplace_api_processing(self, var):
input_var_2 = paddle.to_tensor(self.input_var_numpy_2)
return var.subtract(input_var_2)
def inplace_api_processing(self, var):
input_var_2 = paddle.to_tensor(self.input_var_numpy_2)
return var.subtract_(input_var_2)
class TestLossIsInplaceVar(unittest.TestCase):
def test_loss_is_inplace_var(self):
with paddle.fluid.dygraph.guard():
with _test_eager_guard():
var_a = paddle.ones((2, 2))
var_a.stop_gradient = False
var_b = var_a * 2
loss = var_b.tanh_()
loss.backward()
inplace_grad_var_a = var_a.grad.numpy()
with paddle.fluid.dygraph.guard():
with _test_eager_guard():
var_a = paddle.ones((2, 2))
var_a.stop_gradient = False
var_b = var_a * 2
loss = var_b.tanh()
loss.backward()
grad_var_a = var_a.grad.numpy()
self.assertTrue(np.array_equal(inplace_grad_var_a, grad_var_a))
class TestContinuouslyInplace(unittest.TestCase):
def test_continuously_inplace(self):
with _test_eager_guard():
a = paddle.rand([2, 3])
a.stop_gradient = False
b = a * 2
b.reshape_([-1])
b.reshape_([2, 3])
b.reshape_([-1])
b.backward()
if __name__ == '__main__':
unittest.main()