From f824bc0d0e34767b8c81f14a77ef46303e3d6f08 Mon Sep 17 00:00:00 2001
From: HongyuJia <jiahongyu@baidu.com>
Date: Thu, 16 Mar 2023 19:40:21 +0800
Subject: [PATCH] [Custom Operator] Custom op support inplace mechanism
 (#51620)

* init unit test commit, contains register thinking

* support inplace

* get inplaced x.grad

* Try support inplace and hook at the same time

* Support inplace, need debug

* Support inplace successfully

* Inplace use Tensor&, consistent with Tensor*

* fix MapPlainOutputs bug

* fix double grad inplace error
---
 .../custom_operator/custom_operator_node.cc   |  12 +
 paddle/fluid/framework/custom_operator.cc     |  66 +++-
 paddle/fluid/framework/op_meta_info_helper.h  |   4 +
 paddle/fluid/pybind/eager_functions.cc        |  42 +++
 paddle/fluid/pybind/eager_utils.cc            |   3 +-
 paddle/phi/api/ext/op_meta_info.h             |  70 +++-
 paddle/phi/api/lib/op_meta_info.cc            |  82 +++++
 .../fluid/tests/custom_op/CMakeLists.txt      |   1 +
 .../fluid/tests/custom_op/custom_inplace.cc   | 136 +++++++
 .../tests/custom_op/test_custom_inplace.py    | 333 ++++++++++++++++++
 10 files changed, 713 insertions(+), 36 deletions(-)
 create mode 100644 python/paddle/fluid/tests/custom_op/custom_inplace.cc
 create mode 100644 python/paddle/fluid/tests/custom_op/test_custom_inplace.py

diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc
index df959759816..c99f524b246 100644
--- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc
+++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc
@@ -174,6 +174,9 @@ RunCustomOpNode::operator()(paddle::small_vector<std::vector<paddle::Tensor>,
       egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]);
   auto grad_outputs_names = paddle::framework::OpMetaInfoHelper::GetOutputs(
       egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]);
+  const auto& grad_inplace_map =
+      paddle::framework::OpMetaInfoHelper::GetInplaceMap(
+          egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]);
   auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_);
   auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap();
 
@@ -205,6 +208,9 @@ RunCustomOpNode::operator()(paddle::small_vector<std::vector<paddle::Tensor>,
   }
   VLOG(6) << "Prepare Grad attrs";
   ctx.EmplaceBackAttrs(attrs_);
+  // NOTE(HongyuJia): grad_outputs_names.size() <= OutputMeta().size():
+  // OutputMeta().size() indicates input size of forward op,
+  // grad_outputs_names.size() indicates output size of backward op.
   paddle::small_vector<std::vector<paddle::Tensor>, kSlotSmallVectorSize> outs(
       OutputMeta().size());
   paddle::small_vector<std::vector<paddle::Tensor>, kSlotSmallVectorSize>
@@ -234,8 +240,10 @@ RunCustomOpNode::operator()(paddle::small_vector<std::vector<paddle::Tensor>,
   }
   VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_ << "_grad";
 
+  ctx.MapPlainOutputs(grad_inputs_name, grad_outputs_names, grad_inplace_map);
   (*paddle::framework::OpMetaInfoHelper::GetKernelFn(
       kernel_map.at(op_type_)[1]))(&ctx);
+  ctx.AssignInplaceOutputs();
 
   VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op";
   std::vector<std::vector<egr::AutogradMeta*>> ins_auto_grad_metas;
@@ -353,6 +361,8 @@ RunCustomOpDoubleGradNode::operator()(
       paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[2]);
   auto grad_outputs_names =
       paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[2]);
+  const auto& grad_inplace_map =
+      paddle::framework::OpMetaInfoHelper::GetInplaceMap(vec_map[2]);
   auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_);
   auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap();
 
@@ -419,8 +429,10 @@ RunCustomOpDoubleGradNode::operator()(
   }
   VLOG(7) << "Run Kernel of Grad Custom Op: " << name();
 
+  ctx.MapPlainOutputs(grad_inputs_name, grad_outputs_names, grad_inplace_map);
   (*paddle::framework::OpMetaInfoHelper::GetKernelFn(
       kernel_map.at(op_type_)[2]))(&ctx);
+  ctx.AssignInplaceOutputs();
 
   return outs;
 }
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 9a834e58323..2ddbf738787 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -130,11 +130,13 @@ static std::vector<std::string> ParseAttrStr(const std::string& attr) {
 ////////////////// Kernel Define ////////////////////
 
 // custom op kernel call function define
-static void RunKernelFunc(const framework::ExecutionContext& ctx,
-                          const paddle::KernelFunc& func,
-                          const std::vector<std::string>& inputs,
-                          const std::vector<std::string>& outputs,
-                          const std::vector<std::string>& attrs) {
+static void RunKernelFunc(
+    const framework::ExecutionContext& ctx,
+    const paddle::KernelFunc& func,
+    const std::vector<std::string>& inputs,
+    const std::vector<std::string>& outputs,
+    const std::vector<std::string>& attrs,
+    const std::unordered_map<std::string, std::string>& inplace_map) {
   VLOG(3) << "Custom Operator: Start run KernelFunc.";
   // prepare CustomOpKernelContext
   paddle::CustomOpKernelContext kernel_ctx;
@@ -283,7 +285,10 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
       VLOG(4) << "Initialize phi tensor operants successfully";
     }
 
+    // handle inplace case
+    kernel_ctx.MapPlainOutputs(inputs, outputs, inplace_map);
     func(&kernel_ctx);
+    kernel_ctx.AssignInplaceOutputs();
 
     // sync output tensor data into original output
     auto* calc_outs = kernel_ctx.AllMutableOutput();
@@ -686,12 +691,14 @@ static void RegisterOperatorKernelWithPlace(
   OperatorWithKernel::AllOpKernels()[name][key] = op_kernel_func;
 }
 
-static void RegisterOperatorKernel(const std::string& name,
-                                   const paddle::KernelFunc& kernel_func,
-                                   const std::vector<std::string>& inputs,
-                                   const std::vector<std::string>& outputs,
-                                   const std::vector<std::string>& attrs,
-                                   void* dso_handle) {
+static void RegisterOperatorKernel(
+    const std::string& name,
+    const paddle::KernelFunc& kernel_func,
+    const std::vector<std::string>& inputs,
+    const std::vector<std::string>& outputs,
+    const std::vector<std::string>& attrs,
+    const std::unordered_map<std::string, std::string>& inplace_map,
+    void* dso_handle) {
   VLOG(3) << "Custom Operator: op name in kernel: " << name;
   // NOTE [ Dummy Op Kernel Key ]
   // TODO(chenweihang): Because execute engine need get device context based
@@ -701,10 +708,10 @@ static void RegisterOperatorKernel(const std::string& name,
   OperatorWithKernel::OpKernelFunc op_kernel_func;
   if (kernel_func) {
     VLOG(3) << "Register custom operator " << name << " with kernel func";
-    op_kernel_func = [kernel_func, inputs, outputs, attrs](
+    op_kernel_func = [kernel_func, inputs, outputs, attrs, inplace_map](
                          const framework::ExecutionContext& ctx) {
       VLOG(3) << "Custom Operator: run custom kernel func in lambda.";
-      RunKernelFunc(ctx, kernel_func, inputs, outputs, attrs);
+      RunKernelFunc(ctx, kernel_func, inputs, outputs, attrs, inplace_map);
     };
   } else {
     VLOG(3) << "Register custom operator " << name
@@ -760,6 +767,7 @@ void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
   auto& op_inputs = OpMetaInfoHelper::GetInputs(base_op_meta);
   auto& op_outputs = OpMetaInfoHelper::GetOutputs(base_op_meta);
   auto& op_attrs = OpMetaInfoHelper::GetAttrs(base_op_meta);
+  auto& op_inplace_map = OpMetaInfoHelper::GetInplaceMap(base_op_meta);
   auto& kernel_fn = OpMetaInfoHelper::GetKernelFn(base_op_meta);
   auto& infer_shape_func = OpMetaInfoHelper::GetInferShapeFn(base_op_meta);
   auto& infer_dtype_func = OpMetaInfoHelper::GetInferDtypeFn(base_op_meta);
@@ -771,6 +779,12 @@ void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
           << string::join_strings(op_outputs, ',');
   VLOG(3) << "Custom Operator: forward, op attrs: "
           << string::join_strings(op_attrs, ',');
+  if (!op_inplace_map.empty()) {
+    VLOG(3) << "Custom Operator: forward, op inplace_map: "
+            << string::join_strings(op_inplace_map, ',', [](auto& pair) {
+                 return pair.first + ": " + pair.second;
+               });
+  }
 
   // Op
   info.creator_ = [](const std::string& op_name,
@@ -795,6 +809,13 @@ void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
           op_name,
           info.proto_->InitializationErrorString()));
 
+  // Inplace
+  if (!op_inplace_map.empty()) {
+    info.infer_inplace_ = [op_inplace_map](bool use_cuda) {
+      return op_inplace_map;
+    };
+  }
+
   // InferShape
   if (infer_shape_func == nullptr) {
     // use default InferShape
@@ -908,8 +929,13 @@ void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
   }
 
   // Kernel func
-  RegisterOperatorKernel(
-      op_name, kernel_fn, op_inputs, op_outputs, op_attrs, dso_handle);
+  RegisterOperatorKernel(op_name,
+                         kernel_fn,
+                         op_inputs,
+                         op_outputs,
+                         op_attrs,
+                         op_inplace_map,
+                         dso_handle);
 
   // If grad op or double grad op exists
   std::string cur_op_name = op_name;
@@ -920,6 +946,7 @@ void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
     auto& grad_op_inputs = OpMetaInfoHelper::GetInputs(cur_grad_op);
     auto& grad_op_outputs = OpMetaInfoHelper::GetOutputs(cur_grad_op);
     auto& grad_op_attrs = OpMetaInfoHelper::GetAttrs(cur_grad_op);
+    auto& grad_op_inplace_map = OpMetaInfoHelper::GetInplaceMap(cur_grad_op);
     auto& grad_kernel_fn = OpMetaInfoHelper::GetKernelFn(cur_grad_op);
     auto& grad_infer_shape_fn = OpMetaInfoHelper::GetInferShapeFn(cur_grad_op);
 
@@ -928,6 +955,14 @@ void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
             << string::join_strings(grad_op_inputs, ',');
     VLOG(3) << "Custom Operator: backward, op outputs: "
             << string::join_strings(grad_op_outputs, ',');
+    VLOG(3) << "Custom Operator: backward, op attrs: "
+            << string::join_strings(grad_op_attrs, ',');
+    if (!grad_op_inplace_map.empty()) {  // check the grad op's own map
+      VLOG(3) << "Custom Operator: backward, op inplace_map: "
+              << string::join_strings(grad_op_inplace_map, ',', [](auto& pair) {
+                   return pair.first + ": " + pair.second;
+                 });
+    }
 
     bool is_double_grad = (i == 2);
 
@@ -1040,6 +1075,7 @@ void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
                            grad_op_inputs,
                            grad_op_outputs,
                            grad_op_attrs,
+                           grad_op_inplace_map,
                            dso_handle);
 
     // update current info
diff --git a/paddle/fluid/framework/op_meta_info_helper.h b/paddle/fluid/framework/op_meta_info_helper.h
index b93e0ab0f55..20154e1ee38 100644
--- a/paddle/fluid/framework/op_meta_info_helper.h
+++ b/paddle/fluid/framework/op_meta_info_helper.h
@@ -39,6 +39,10 @@ class OpMetaInfoHelper {
       const paddle::OpMetaInfo& info) {
     return info.attrs_;
   }
+  static const std::unordered_map<std::string, std::string>& GetInplaceMap(
+      const paddle::OpMetaInfo& info) {
+    return info.inplace_map_;
+  }
   static const KernelFunc& GetKernelFn(const paddle::OpMetaInfo& info) {
     return info.kernel_fn_;
   }
diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc
index 9a3229f3210..50508d1db5c 100644
--- a/paddle/fluid/pybind/eager_functions.cc
+++ b/paddle/fluid/pybind/eager_functions.cc
@@ -531,7 +531,18 @@ static PyObject* eager_api_run_custom_op(PyObject* self,
                                   meta_info_map.at(op_type)[0]));
     ctx.EmplaceBackAttrs(res_attrs);
     const auto& vec_map = meta_info_map.at(op_type);
+
+    // handle inplace case
+    const auto& inputs = paddle::framework::OpMetaInfoHelper::GetInputs(
+        meta_info_map.at(op_type)[0]);
+    const auto& outputs = paddle::framework::OpMetaInfoHelper::GetOutputs(
+        meta_info_map.at(op_type)[0]);
+    const auto& inplace_map =
+        paddle::framework::OpMetaInfoHelper::GetInplaceMap(
+            meta_info_map.at(op_type)[0]);
+    ctx.MapPlainOutputs(inputs, outputs, inplace_map);
     (*paddle::framework::OpMetaInfoHelper::GetKernelFn(vec_map[0]))(&ctx);
+    ctx.AssignInplaceOutputs();
 
     VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op";
     std::vector<std::vector<egr::AutogradMeta*>> ins_auto_grad_metas;
@@ -557,12 +568,43 @@ static PyObject* eager_api_run_custom_op(PyObject* self,
           require_any_grad || egr::EagerUtils::ComputeRequireGrad(
                                   trace_backward, &(ins_auto_grad_metas[i]));
     }
+
+    // handle inplace case
+    for (size_t i = 0; i < ctx.InputRange().size(); i++) {
+      if (inplace_map.find(inputs[i]) != inplace_map.end()) {
+        size_t input_size =
+            ctx.InputRangeAt(i).second - ctx.InputRangeAt(i).first;
+        size_t start_idx = ctx.InputRangeAt(i).first;
+        for (size_t j = 0; j < input_size; j++) {
+          egr::EagerUtils::CheckInplace(ctx.InputAt(start_idx + j),
+                                        ins_auto_grad_metas[i][j],
+                                        require_any_grad);
+          // Bump Inplace Version
+          ctx.MutableInputAt(start_idx + j).bump_inplace_version();
+          VLOG(3) << "Custom operator: Tensor("
+                  << ctx.InputAt(start_idx + j).name()
+                  << ") uses Inplace Strategy.";
+        }
+      }
+    }
+
     if (require_any_grad && (vec_map.size() > 1)) {
       VLOG(6) << " Construct Grad for Custom Op: " << op_type;
       ConstructFwdAndBwdMap(vec_map, op_type);
       for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) {
         egr::EagerUtils::PassStopGradient(false, &(outs_auto_grad_metas[i]));
       }
+      // Note(HongyuJia): In dygraph eager mode, CheckInplace makes sure leaf
+      // nodes set stop_gradient=True. However, dygraph mode can also output
+      // leaf nodes' gradients (For example, we can get x.grad after x.add_(y)).
+      // To be consistent with dygraph mode, we have to PassStopGradient for all
+      // inplaced ins_auto_grad_metas.
+      std::unordered_map<size_t, size_t> inplace_tensor_map =
+          ctx.GetInplaceTensorMap();
+      for (auto pair : inplace_tensor_map) {
+        egr::EagerUtils::PassStopGradient(false,
+                                          &(ins_auto_grad_metas[pair.first]));
+      }
       auto grad_node = std::make_shared<egr::RunCustomOpNode>(
           outs_auto_grad_metas.size(), ins_auto_grad_metas.size(), op_type);
       auto slot_map =
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index 8637b12f8aa..fc7d359afd7 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -609,8 +609,7 @@ paddle::CustomOpKernelContext CastPyArg2CustomOpKernelContext(PyObject* obj,
     return ::pybind11::handle(obj).cast<paddle::CustomOpKernelContext>();
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
-        "argument (position %d) must be "
-        "one of(Place,CUDAPlace,CPUPlace,XPUPlace,NPUPlace,CUDAPinnedPlace), "
+        "argument (position %d) must be CustomOpKernelContext, "
         "but got %s",
         arg_pos + 1,
         reinterpret_cast<PyTypeObject*>(obj->ob_type)->tp_name));
diff --git a/paddle/phi/api/ext/op_meta_info.h b/paddle/phi/api/ext/op_meta_info.h
index 95eddb97455..77ec8c417da 100644
--- a/paddle/phi/api/ext/op_meta_info.h
+++ b/paddle/phi/api/ext/op_meta_info.h
@@ -108,6 +108,7 @@ class PADDLE_API CustomOpKernelContext {
 
   const Tensor& InputAt(size_t idx) const;
   std::vector<Tensor> InputsBetween(size_t start, size_t end) const;
+  Tensor& MutableInputAt(size_t idx);
   const std::vector<paddle::any>& Attrs() const { return attrs_; }
   const std::vector<std::pair<size_t, size_t>>& InputRange() {
     return input_range_;
@@ -129,11 +130,23 @@ class PADDLE_API CustomOpKernelContext {
     }
   }
 
+  // handle inplace case
+  void MapPlainOutputs(
+      const std::vector<std::string>& inputs,
+      const std::vector<std::string>& outputs,
+      const std::unordered_map<std::string, std::string>& inplace_map);
+  void AssignInplaceOutputs();
+  std::vector<Tensor*>* AllMutablePlainOutput();
+  std::unordered_map<size_t, size_t> GetInplaceTensorMap();
+
  private:
   // TODO(chenweihang): replaced be SmallVector
   std::vector<Tensor> inputs_;
   std::vector<Tensor> outputs_;
   std::vector<paddle::any> attrs_;
+  // handle inplace case
+  std::vector<Tensor*> plain_outputs_;
+  std::unordered_map<size_t, size_t> inplace_tensor_map_;
 
   std::vector<std::pair<size_t, size_t>> input_range_;
   std::vector<std::pair<size_t, size_t>> output_range_;
@@ -148,8 +161,7 @@ using KernelFunc = void (*)(CustomOpKernelContext*);
   template <typename... Tail>                                                  \
   struct ComputeCallHelper<attr_type, Tail...> {                               \
     template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs> \
-    static void Compute(CustomOpKernelContext* ctx,                            \
-                        const PreviousArgs&... pargs) {                        \
+    static void Compute(CustomOpKernelContext* ctx, PreviousArgs&... pargs) {  \
       attr_type arg = ctx->AttrAt<attr_type>(attr_idx);                        \
       ComputeCallHelper<                                                       \
           Tail...>::template Compute<in_idx, attr_idx + 1, out_idx>(ctx,       \
@@ -177,10 +189,9 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
   template <typename... Tail>
   struct ComputeCallHelper<const Tensor&, Tail...> {
     template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
-    static void Compute(CustomOpKernelContext* ctx,
-                        const PreviousArgs&... pargs) {
+    static void Compute(CustomOpKernelContext* ctx, PreviousArgs&... pargs) {
       auto& range = ctx->InputRangeAt(in_idx);
-      auto& arg = ctx->InputAt(range.first);
+      auto& arg = ctx->MutableInputAt(range.first);
       ComputeCallHelper<
           Tail...>::template Compute<in_idx + 1, attr_idx, out_idx>(ctx,
                                                                     pargs...,
@@ -191,8 +202,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
   template <typename... Tail>
   struct ComputeCallHelper<const std::vector<Tensor>&, Tail...> {
     template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
-    static void Compute(CustomOpKernelContext* ctx,
-                        const PreviousArgs&... pargs) {
+    static void Compute(CustomOpKernelContext* ctx, PreviousArgs&... pargs) {
       auto& range = ctx->InputRangeAt(in_idx);
       auto arg = ctx->InputsBetween(range.first, range.second);
       ComputeCallHelper<
@@ -232,11 +242,12 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
   PD_SPECIALIZE_ComputeCallHelper(std::vector<int64_t>);
   PD_SPECIALIZE_ComputeCallHelper(std::vector<std::string>);
 
+  // Kept for compatibility with the internal inplace interface released in
+  // 2.3; not recommended for new code.
   template <typename... Tail>
   struct ComputeCallHelper<Tensor*, Tail...> {
     template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
-    static void Compute(CustomOpKernelContext* ctx,
-                        const PreviousArgs&... pargs) {
+    static void Compute(CustomOpKernelContext* ctx, PreviousArgs&... pargs) {
       auto& range = ctx->OutputRangeAt(out_idx);
       auto* arg = ctx->MutableOutputAt(range.first);
       ComputeCallHelper<
@@ -246,13 +257,14 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
     }
   };
 
+  // Kept for compatibility with the internal inplace interface released in
+  // 2.3; not recommended for new code.
   // TODO(chenweihang): What is the appropriate output form?
   // std::vector<Tensor>*? or std::vector<Tensor*>? or std::vector<Tensor*>*
   template <typename... Tail>
   struct ComputeCallHelper<std::vector<Tensor*>, Tail...> {
     template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
-    static void Compute(CustomOpKernelContext* ctx,
-                        const PreviousArgs&... pargs) {
+    static void Compute(CustomOpKernelContext* ctx, PreviousArgs&... pargs) {
       auto& range = ctx->OutputRangeAt(out_idx);
       auto arg = ctx->MutableOutputBetweeen(range.first, range.second);
       ComputeCallHelper<
@@ -262,18 +274,32 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
     }
   };
 
+  // Handle Tensor& for inplace case
+  template <typename... Tail>
+  struct ComputeCallHelper<Tensor&, Tail...> {
+    template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
+    static void Compute(CustomOpKernelContext* ctx, PreviousArgs&... pargs) {
+      auto& range = ctx->InputRangeAt(in_idx);
+      auto& arg = ctx->MutableInputAt(range.first);
+      ComputeCallHelper<
+          Tail...>::template Compute<in_idx + 1, attr_idx, out_idx>(ctx,
+                                                                    pargs...,
+                                                                    arg);
+    }
+  };
+
   template <int out_idx, typename T>
   struct ComputeReturnHelper;
 
   // For compatibility with the original custom op form
   template <int out_idx>
   struct ComputeReturnHelper<out_idx, std::vector<Tensor>> {
-    static void Compute(CustomOpKernelContext* ctx, const Args&... args) {
+    static void Compute(CustomOpKernelContext* ctx, Args&... args) {
       static_assert(out_idx == 0,
                     "If return std::vector<Tensor> in Custom OpKernel, "
                     "you cannot pass output by kernel function argument.");
       auto outs = impl_fn(args...);
-      auto* orig_outs = ctx->AllMutableOutput();
+      auto* orig_outs = ctx->AllMutablePlainOutput();
       PD_CHECK(orig_outs->size() == outs.size(),
                "The number of element in custom operator outputs is wrong, "
                "expected contains ",
@@ -282,15 +308,14 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
                outs.size(),
                " Tensors.");
       for (size_t i = 0; i < outs.size(); ++i) {
-        AssignTensorImpl(outs.at(i), &(orig_outs->at(i)));
+        AssignTensorImpl(outs.at(i), orig_outs->at(i));
       }
     }
   };
 
   template <int out_idx>
   struct ComputeReturnHelper<out_idx, void> {
-    static void Compute(CustomOpKernelContext* ctx, const Args&... args) {
-      static_assert(out_idx > 0, "Custom OpKernel has no output.");
+    static void Compute(CustomOpKernelContext* ctx, Args&... args) {
       impl_fn(args...);
     }
   };
@@ -299,8 +324,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
   template <typename T>
   struct ComputeCallHelper<TypeTag<T>> {
     template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
-    static void Compute(CustomOpKernelContext* ctx,
-                        const PreviousArgs&... pargs) {
+    static void Compute(CustomOpKernelContext* ctx, PreviousArgs&... pargs) {
       ComputeReturnHelper<out_idx, Return>::Compute(ctx, pargs...);
     }
   };
@@ -547,9 +571,14 @@ class PADDLE_API OpMetaInfo {
   // format: {"<name1>", "<name2>", ...}
   OpMetaInfo& Outputs(std::vector<std::string>&& outputs);
 
-  // format: {"<name1>:<type1>", "<name1>:<type1>", ...}
+  // format: {"<name1>:<type1>", "<name2>:<type2>", ...}
   OpMetaInfo& Attrs(std::vector<std::string>&& attrs);
 
+  // format: {{"<input_name1>", "<output_name1>"},
+  // {"<input_name2>", "<output_name2>"}, ...}
+  OpMetaInfo& Inplace(
+      std::unordered_map<std::string, std::string>&& inplace_map);
+
   // format: PD_KERNEL(...)
   OpMetaInfo& SetKernelFn(KernelFunc&& func);
 
@@ -567,6 +596,7 @@ class PADDLE_API OpMetaInfo {
   std::vector<std::string> inputs_;
   std::vector<std::string> outputs_;
   std::vector<std::string> attrs_;
+  std::unordered_map<std::string, std::string> inplace_map_;
   // 2. func info
   KernelFunc kernel_fn_{nullptr};
   InferShapeFunc infer_shape_fn_{nullptr};
@@ -605,6 +635,8 @@ class PADDLE_API OpMetaInfoBuilder {
   OpMetaInfoBuilder& Inputs(std::vector<std::string>&& inputs);
   OpMetaInfoBuilder& Outputs(std::vector<std::string>&& outputs);
   OpMetaInfoBuilder& Attrs(std::vector<std::string>&& attrs);
+  OpMetaInfoBuilder& Inplace(
+      std::unordered_map<std::string, std::string>&& inplace_map);
   OpMetaInfoBuilder& SetKernelFn(KernelFunc func);
   OpMetaInfoBuilder& SetInferShapeFn(InferShapeFunc func);
   OpMetaInfoBuilder& SetInferDtypeFn(InferDtypeFunc func);
diff --git a/paddle/phi/api/lib/op_meta_info.cc b/paddle/phi/api/lib/op_meta_info.cc
index a6b7921c30c..487308ea568 100644
--- a/paddle/phi/api/lib/op_meta_info.cc
+++ b/paddle/phi/api/lib/op_meta_info.cc
@@ -94,6 +94,10 @@ std::vector<Tensor> CustomOpKernelContext::InputsBetween(size_t start,
   return rlt;
 }
 
+Tensor& CustomOpKernelContext::MutableInputAt(size_t idx) {
+  return inputs_.at(idx);
+}
+
 Tensor* CustomOpKernelContext::MutableOutputAt(size_t idx) {
   return &(outputs_.at(idx));
 }
@@ -128,6 +132,71 @@ const std::pair<size_t, size_t>& CustomOpKernelContext::OutputRangeAt(
   return output_range_.at(idx);
 }
 
+// handle inplace mechanism
+// Find out non-inplace output tensors.
+void CustomOpKernelContext::MapPlainOutputs(
+    const std::vector<std::string>& inputs,
+    const std::vector<std::string>& outputs,
+    const std::unordered_map<std::string, std::string>& inplace_map) {
+  // Record {input slot index -> output slot index} for every inplace pair.
+  for (size_t in_idx = 0; in_idx < inputs.size(); ++in_idx) {
+    auto iter = inplace_map.find(inputs[in_idx]);  // single lookup
+    if (iter == inplace_map.end()) {
+      continue;
+    }
+    auto out_iter = std::find(outputs.begin(), outputs.end(), iter->second);
+    PADDLE_ENFORCE(
+        out_iter != outputs.end(),
+        phi::errors::NotFound("Can't find the mapped value of %s, please check "
+                              "the input of `Inplace` again and make "
+                              "sure you registered your op accurately. ",
+                              inputs[in_idx]));
+    inplace_tensor_map_[in_idx] = std::distance(outputs.begin(), out_iter);
+  }
+  // Collect pointers to outputs that are not the target of any inplace pair.
+  for (size_t i = 0; i < outputs.size(); ++i) {
+    const bool is_inplace_out = std::any_of(
+        inplace_tensor_map_.begin(),
+        inplace_tensor_map_.end(),
+        [i](const auto& in_out) { return in_out.second == i; });
+    if (is_inplace_out) {
+      continue;  // inplace outputs are handled by AssignInplaceOutputs
+    }
+    for (size_t idx = output_range_[i].first; idx < output_range_[i].second;
+         ++idx) {
+      plain_outputs_.push_back(&outputs_[idx]);
+    }
+  }
+  VLOG(4) << "Custom operator update inplace input-output map successfully.";
+}
+// Assign input tensor to inplace output tensors.
+void CustomOpKernelContext::AssignInplaceOutputs() {
+  for (const auto& pair : inplace_tensor_map_) {  // {input slot, output slot}
+    size_t in_start_idx = input_range_[pair.first].first;
+    size_t in_end_idx = input_range_[pair.first].second;
+    size_t out_start_idx = output_range_[pair.second].first;
+    size_t out_end_idx = output_range_[pair.second].second;
+    size_t assign_tensor_size = in_end_idx - in_start_idx;
+    PADDLE_ENFORCE(  // both slots must hold the same number of tensors
+        assign_tensor_size == out_end_idx - out_start_idx,
+        phi::errors::OutOfRange("When assigning inplaced tensor, Input vector "
+                                "size %d mismatch output vector size %d",
+                                in_end_idx - in_start_idx,
+                                out_end_idx - out_start_idx));
+    for (size_t i = 0; i < assign_tensor_size; ++i) {
+      AssignTensorImpl(inputs_[in_start_idx + i], &outputs_[out_start_idx + i]);
+    }
+    VLOG(4)
+        << "Custom operator update inplace input-output tensor successfully.";
+  }
+}
+std::vector<Tensor*>* CustomOpKernelContext::AllMutablePlainOutput() {
+  return &plain_outputs_;
+}
+std::unordered_map<size_t, size_t>
+CustomOpKernelContext::GetInplaceTensorMap() {
+  return inplace_tensor_map_;
+}
 ////////////////////// Op Meta Info //////////////////////
 
 OpMetaInfo& OpMetaInfo::Inputs(std::vector<std::string>&& inputs) {
@@ -142,6 +211,12 @@ OpMetaInfo& OpMetaInfo::Attrs(std::vector<std::string>&& attrs) {
   attrs_ = std::forward<std::vector<std::string>>(attrs);
   return *this;
 }
+OpMetaInfo& OpMetaInfo::Inplace(
+    std::unordered_map<std::string, std::string>&& inplace_map) {
+  inplace_map_ =
+      std::forward<std::unordered_map<std::string, std::string>>(inplace_map);
+  return *this;
+}
 OpMetaInfo& OpMetaInfo::SetKernelFn(KernelFunc&& func) {
   kernel_fn_ = std::forward<KernelFunc>(func);
   return *this;
@@ -222,6 +297,13 @@ OpMetaInfoBuilder& OpMetaInfoBuilder::Attrs(std::vector<std::string>&& attrs) {
   return *this;
 }
 
+OpMetaInfoBuilder& OpMetaInfoBuilder::Inplace(
+    std::unordered_map<std::string, std::string>&& inplace_map) {
+  info_ptr_->Inplace(
+      std::forward<std::unordered_map<std::string, std::string>>(inplace_map));
+  return *this;
+}
+
 OpMetaInfoBuilder& OpMetaInfoBuilder::SetKernelFn(KernelFunc func) {
   info_ptr_->SetKernelFn(std::forward<KernelFunc>(func));
   return *this;
diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
index 17dc7468853..7fc26aed21d 100644
--- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt
@@ -50,6 +50,7 @@ py_test(test_custom_conj SRCS test_custom_conj.py)
 py_test(test_custom_linear SRCS test_custom_linear.py)
 py_test(test_custom_simple_slice SRCS test_custom_simple_slice.py)
 py_test(test_custom_tanh_double_grad SRCS test_custom_tanh_double_grad.py)
+py_test(test_custom_inplace SRCS test_custom_inplace.py)
 
 # other tests
 py_test(test_sysconfig SRCS test_sysconfig.py)
diff --git a/python/paddle/fluid/tests/custom_op/custom_inplace.cc b/python/paddle/fluid/tests/custom_op/custom_inplace.cc
new file mode 100644
index 00000000000..7b57a632ca6
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_op/custom_inplace.cc
@@ -0,0 +1,136 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+// express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream>
+#include <vector>
+
+#include "paddle/extension.h"
+
+// In-place elementwise add: x_data[i] += y_data[i] for every i in [0, numel).
+template <typename data_t>
+void add_forward_kernel(data_t* x_data, const data_t* y_data, int64_t numel) {
+  // Signed index matches numel; a size_t index would wrap a negative numel.
+  for (int64_t i = 0; i < numel; ++i) x_data[i] += y_data[i];
+}
+
+// Gradient of the add w.r.t. y: copies out_grad_data into y_grad_data.
+template <typename data_t>
+void add_backward_kernel(data_t* y_grad_data,
+                         const data_t* out_grad_data,
+                         int64_t numel) {
+  // Signed index matches numel; a size_t index would wrap a negative numel.
+  for (int64_t i = 0; i < numel; ++i) y_grad_data[i] = out_grad_data[i];
+}
+
+// In-place ReLU: every element that is not > 0 becomes 0.
+template <typename data_t>
+void relu_forward_kernel(data_t* x_data, int64_t numel) {
+  // Signed index matches numel; a size_t index would wrap a negative numel.
+  for (int64_t i = 0; i < numel; ++i) x_data[i] = x_data[i] > 0 ? x_data[i] : 0;
+}
+
+template <typename data_t>
+void relu_backward_kernel(const data_t* out_data,
+                          data_t* grad_out_data,  // read, then overwritten
+                          int64_t out_numel) {
+  for (int64_t i = 0; i < out_numel; ++i) {
+    grad_out_data[i] =  // gradient is kept only where the relu output is > 0
+        grad_out_data[i] * (out_data[i] > static_cast<data_t>(0) ? 1. : 0.);
+  }
+}
+
+void AddForward(paddle::Tensor& x, const paddle::Tensor& y) {  // NOLINT
+  PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor.");
+  PD_CHECK(y.place() == paddle::PlaceType::kCPU, "y must be a CPU Tensor.");
+  PD_DISPATCH_FLOATING_TYPES(x.type(), "AddForward", ([&] {
+                               add_forward_kernel<data_t>(x.data<data_t>(),
+                                                          y.data<data_t>(),
+                                                          x.size());
+                             }));  // x += y in place; x is also output "Out"
+}
+
+std::vector<paddle::DataType> AddInferDtype(const paddle::DataType& x_dtype,
+                                            const paddle::DataType& y_dtype) {
+  return {x_dtype};  // one entry, taken from X; y_dtype is not consulted
+}
+
+std::vector<std::vector<int64_t>> AddInferShape(
+    const std::vector<int64_t>& x_shape, const std::vector<int64_t>& y_shape) {
+  return {x_shape};  // one entry, taken from X; y_shape is not consulted
+}
+
+// custom_add backward: returns Grad(Y); Grad(X) reuses out_grad in place.
+std::vector<paddle::Tensor> AddBackward(const paddle::Tensor& x,
+                                        const paddle::Tensor& y,
+                                        paddle::Tensor& out_grad) {  // NOLINT
+  PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor.");
+  PD_CHECK(y.place() == paddle::PlaceType::kCPU, "y must be a CPU Tensor.");
+
+  paddle::Tensor y_grad = paddle::empty(x.shape(), x.dtype(), x.place());
+
+  PD_DISPATCH_FLOATING_TYPES(
+      out_grad.type(), "AddBackward", ([&] {
+        add_backward_kernel<data_t>(
+            y_grad.data<data_t>(), out_grad.data<data_t>(), out_grad.size());
+      }));
+  return {y_grad};
+}
+
+PD_BUILD_OP(custom_add)
+    .Inputs({"X", "Y"})
+    .Outputs({"Out"})
+    .Inplace({{"X", "Out"}})  // input "X" is paired with output "Out"
+    .SetKernelFn(PD_KERNEL(AddForward))
+    .SetInferShapeFn(PD_INFER_SHAPE(AddInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(AddInferDtype));
+
+PD_BUILD_GRAD_OP(custom_add)
+    .Inputs({"X", "Y", paddle::Grad("Out")})
+    .Outputs({paddle::Grad("X"), paddle::Grad("Y")})
+    .Inplace({{paddle::Grad("Out"), paddle::Grad("X")}})  // dOut -> dX
+    .SetKernelFn(PD_KERNEL(AddBackward));
+
+void ReluForwardInplace(paddle::Tensor& x) {  // NOLINT
+  PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor.");
+
+  PD_DISPATCH_FLOATING_TYPES(x.type(), "ReluForward", ([&] {
+                               relu_forward_kernel<data_t>(x.data<data_t>(),
+                                                           x.size());
+                             }));  // x is rewritten in place by the kernel
+}
+
+// custom_relu_inplace backward: masks grad_out in place using the forward out.
+void ReluBackwardInplace(const paddle::Tensor& x,
+                         const paddle::Tensor& out,
+                         paddle::Tensor& grad_out) {  // NOLINT
+  PD_CHECK(out.place() == paddle::PlaceType::kCPU, "out must be a CPU Tensor.");
+  PD_DISPATCH_FLOATING_TYPES(
+      grad_out.type(), "ReluBackward", ([&] {
+        relu_backward_kernel<data_t>(
+            out.data<data_t>(), grad_out.data<data_t>(), grad_out.size());
+      }));
+}
+
+PD_BUILD_OP(custom_relu_inplace)
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .Inplace({{"X", "Out"}})  // input "X" is paired with output "Out"
+    .SetKernelFn(PD_KERNEL(ReluForwardInplace));
+
+PD_BUILD_GRAD_OP(custom_relu_inplace)
+    .Inputs({"X", "Out", paddle::Grad("Out")})
+    .Outputs({paddle::Grad("X")})
+    .Inplace({{paddle::Grad("Out"), paddle::Grad("X")}})  // dOut -> dX
+    .SetKernelFn(PD_KERNEL(ReluBackwardInplace));
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_inplace.py b/python/paddle/fluid/tests/custom_op/test_custom_inplace.py
new file mode 100644
index 00000000000..d3a89594101
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_op/test_custom_inplace.py
@@ -0,0 +1,333 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+
+import numpy as np
+from utils import extra_cc_args, extra_nvcc_args, paddle_includes
+
+import paddle
+import paddle.static as static
+from paddle.utils.cpp_extension import get_build_directory, load
+from paddle.utils.cpp_extension.extension_utils import run_cmd
+
+# Because Windows doesn't use docker, the shared lib may already exist in the
+# cache dir, and it will not be compiled again unless it is removed first.
+file = '{}\\custom_inplace\\custom_inplace.pyd'.format(get_build_directory())
+if os.name == 'nt' and os.path.isfile(file):  # Windows with a cached .pyd
+    cmd = 'del {}'.format(file)
+    run_cmd(cmd, True)
+
+# Compile and load custom op Just-In-Time.
+custom_inplace = load(
+    name='custom_inplace',
+    sources=['custom_inplace.cc'],
+    extra_include_paths=paddle_includes,  # add for Coverage CI
+    extra_cxx_cflags=extra_cc_args,  # test for cflags
+    extra_cuda_cflags=extra_nvcc_args,  # test for cflags
+    verbose=True,
+)
+
+
+def inplace_dynamic_add(phi_func, device, dtype, np_x, np_y):  # add + backward
+    paddle.set_device(device)
+    x = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=True)
+    y = paddle.to_tensor(np_y, dtype=dtype, stop_gradient=False)
+    if phi_func:  # truthy: custom op under test; falsy: built-in x.add_
+        out = custom_inplace.custom_add(x, y)
+    else:
+        out = x.add_(y)
+
+    out.backward()
+    return x.numpy(), y.numpy(), out.numpy(), x.grad.numpy(), y.grad.numpy()
+
+
+def inplace_static_add(func, device, dtype, np_x, np_y):  # static-graph run
+    paddle.enable_static()
+    paddle.set_device(device)
+    with static.scope_guard(static.Scope()):
+        with static.program_guard(static.Program()):
+            x = static.data(name="x", shape=[None, np_x.shape[1]], dtype=dtype)
+            y = static.data(name="y", shape=[None, np_y.shape[1]], dtype=dtype)
+            x.stop_gradient = False
+            y.stop_gradient = False
+            out = func(x, y)  # func is the add implementation being exercised
+            mean_out = paddle.mean(out)
+            static.append_backward(mean_out)
+
+            exe = static.Executor()
+            exe.run(static.default_startup_program())
+
+            x_v, out_v, x_grad_v, y_grad_v, out_grad_v = exe.run(
+                static.default_main_program(),
+                feed={
+                    "x": np_x.astype(dtype),
+                    "y": np_y.astype(dtype),
+                },
+                fetch_list=[  # order must match the unpacking above
+                    x.name,
+                    out.name,
+                    x.name + "@GRAD",
+                    y.name + "@GRAD",
+                    out.name + "@GRAD",
+                ],
+            )
+    paddle.disable_static()  # switch back to dynamic mode before returning
+    return x_v, out_v, x_grad_v, y_grad_v, out_grad_v
+
+
+def inplace_dynamic_relu(phi_func, device, dtype, np_x, np_y, np_z):
+    paddle.set_device(device)
+    x = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False)
+    y = paddle.to_tensor(np_y, dtype=dtype, stop_gradient=False)
+    z = paddle.to_tensor(np_z, dtype=dtype, stop_gradient=False)
+    out_xy = x + y
+    if phi_func:  # truthy: custom inplace relu; falsy: paddle's relu_
+        out_xy = custom_inplace.custom_relu_inplace(out_xy)
+        out_xyz = out_xy + z
+        out = custom_inplace.custom_relu_inplace(out_xyz)
+    else:
+        out_xy = paddle.nn.functional.relu_(out_xy)
+        out_xyz = out_xy + z
+        out = paddle.nn.functional.relu_(out_xyz)
+
+    out.backward()  # z's gradient is computed but not part of the result
+    return x.numpy(), y.numpy(), out.numpy(), x.grad.numpy(), y.grad.numpy()
+
+
+def inplace_static_relu(func, device, dtype, np_x, np_y, np_z):
+    paddle.enable_static()
+    paddle.set_device(device)
+    with static.scope_guard(static.Scope()):
+        with static.program_guard(static.Program()):
+            x = static.data(name="x", shape=[None, np_x.shape[1]], dtype=dtype)
+            y = static.data(name="y", shape=[None, np_y.shape[1]], dtype=dtype)
+            z = static.data(name="z", shape=[None, np_z.shape[1]], dtype=dtype)
+            x.stop_gradient = False
+            y.stop_gradient = False
+            z.stop_gradient = False
+            out_xy = x + y
+            out_xy = func(out_xy)  # first application of the relu under test
+            out_xyz = out_xy + z
+            out = func(out_xyz)  # second application, on the chained result
+            mean_out = paddle.mean(out)
+            static.append_backward(mean_out)
+
+            exe = static.Executor()
+            exe.run(static.default_startup_program())
+
+            x_v, y_v, out_v, x_grad_v, y_grad_v = exe.run(
+                static.default_main_program(),
+                feed={
+                    "x": np_x.astype(dtype),
+                    "y": np_y.astype(dtype),
+                    "z": np_z.astype(dtype),
+                },
+                fetch_list=[  # order must match the unpacking above
+                    x.name,
+                    y.name,
+                    out.name,
+                    x.name + "@GRAD",
+                    y.name + "@GRAD",
+                ],
+            )
+    paddle.disable_static()  # switch back to dynamic mode before returning
+    return x_v, y_v, out_v, x_grad_v, y_grad_v
+
+
+class TestCustomInplaceJit(unittest.TestCase):
+    def setUp(self):  # NOTE(review): inputs lie in [0, 1), so relu never clips
+        self.dtypes = ['float32', 'float64']
+        self.devices = ['cpu']
+        self.np_x = np.random.random((3, 2)).astype("float32")
+        self.np_y = np.random.random((3, 2)).astype("float32")
+        self.np_z = np.random.random((3, 2)).astype("float32")
+
+    def check_output(self, out, pd_out, name):  # exact element-wise equality
+        np.testing.assert_array_equal(
+            out,
+            pd_out,
+            err_msg='custom op {}: {},\n paddle api {}: {}'.format(
+                name, out, name, pd_out
+            ),
+        )
+
+    def check_output_allclose(self, out, pd_out, name):  # tolerance-based
+        np.testing.assert_allclose(
+            out,
+            pd_out,
+            rtol=5e-5,
+            atol=1e-2,
+            err_msg='custom op {}: {},\n paddle api {}: {}'.format(
+                name, out, name, pd_out
+            ),
+        )
+
+    def test_static_add(self):  # paddle.add vs custom_add, static graph
+        for device in self.devices:
+            for dtype in self.dtypes:
+                (
+                    pd_x,
+                    pd_out,
+                    pd_x_grad,
+                    pd_y_grad,
+                    pd_out_grad,
+                ) = inplace_static_add(
+                    paddle.add,
+                    device,
+                    dtype,
+                    self.np_x,
+                    self.np_y,
+                )
+                (
+                    phi_x,
+                    phi_out,
+                    phi_x_grad,
+                    phi_y_grad,
+                    phi_out_grad,
+                ) = inplace_static_add(
+                    custom_inplace.custom_add,
+                    device,
+                    dtype,
+                    self.np_x,
+                    self.np_y,
+                )
+                self.check_output(phi_x, phi_out, "inplace_phi_x")  # X == Out
+                self.check_output(
+                    phi_x_grad, phi_out_grad, "inplace_phi_x_grad"
+                )
+
+                self.check_output(phi_out, pd_out, "out")
+                self.check_output(phi_x_grad, pd_x_grad, "x_grad")
+                self.check_output(phi_y_grad, pd_y_grad, "y_grad")
+                self.check_output(phi_out_grad, pd_out_grad, "out_grad")
+
+    def test_dynamic_add(self):  # x.add_ vs custom_add, dynamic mode
+        for device in self.devices:
+            for dtype in self.dtypes:
+                (
+                    pd_x,
+                    pd_y,
+                    pd_out,
+                    pd_x_grad,
+                    pd_y_grad,
+                ) = inplace_dynamic_add(
+                    False,
+                    device,
+                    dtype,
+                    self.np_x,
+                    self.np_y,
+                )
+                (
+                    phi_x,
+                    phi_y,
+                    phi_out,
+                    phi_x_grad,
+                    phi_y_grad,
+                ) = inplace_dynamic_add(
+                    True,
+                    device,
+                    dtype,
+                    self.np_x,
+                    self.np_y,
+                )
+
+                self.check_output(phi_x, phi_out, "inplace_phi_x")  # X == Out
+                self.check_output(pd_x, pd_out, "inplace_pd_x")
+
+                self.check_output(phi_x, pd_x, "x")
+                self.check_output(phi_y, pd_y, "y")
+                self.check_output(phi_out, pd_out, "out")
+                self.check_output(phi_x_grad, pd_x_grad, "x_grad")
+                self.check_output(phi_y_grad, pd_y_grad, "y_grad")
+
+    def test_static_multiple_inplace_relu(self):  # relu vs custom op, static
+        for device in self.devices:
+            for dtype in self.dtypes:
+                (
+                    pd_x,
+                    pd_y,
+                    pd_out,
+                    pd_x_grad,
+                    pd_y_grad,
+                ) = inplace_static_relu(
+                    paddle.nn.functional.relu,
+                    device,
+                    dtype,
+                    self.np_x,
+                    self.np_y,
+                    self.np_z,
+                )
+                (
+                    phi_x,
+                    phi_y,
+                    phi_out,
+                    phi_x_grad,
+                    phi_y_grad,
+                ) = inplace_static_relu(
+                    custom_inplace.custom_relu_inplace,
+                    device,
+                    dtype,
+                    self.np_x,
+                    self.np_y,
+                    self.np_z,
+                )
+                self.check_output_allclose(phi_x, pd_x, "x")
+                self.check_output_allclose(phi_y, pd_y, "y")
+                self.check_output_allclose(phi_out, pd_out, "out")
+                self.check_output_allclose(phi_x_grad, pd_x_grad, "x_grad")
+                self.check_output_allclose(phi_y_grad, pd_y_grad, "y_grad")
+
+    def test_dynamic_multiple_inplace_relu(self):  # relu_ vs custom op
+        for device in self.devices:
+            for dtype in self.dtypes:
+                (
+                    pd_x,
+                    pd_y,
+                    pd_out,
+                    pd_x_grad,
+                    pd_y_grad,
+                ) = inplace_dynamic_relu(
+                    False,
+                    device,
+                    dtype,
+                    self.np_x,
+                    self.np_y,
+                    self.np_z,
+                )
+                (
+                    phi_x,
+                    phi_y,
+                    phi_out,
+                    phi_x_grad,
+                    phi_y_grad,
+                ) = inplace_dynamic_relu(
+                    True,
+                    device,
+                    dtype,
+                    self.np_x,
+                    self.np_y,
+                    self.np_z,
+                )
+
+                self.check_output(phi_x, pd_x, "x")
+                self.check_output(phi_y, pd_y, "y")
+                self.check_output(phi_out, pd_out, "out")
+                self.check_output(phi_x_grad, pd_x_grad, "x_grad")
+                self.check_output(phi_y_grad, pd_y_grad, "y_grad")
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab