Unverified commit 04025237 authored by HongyuJia, committed by GitHub

[CustomOP Inplace] Automap inplace dtype and shape, support vector<Tensor> output (#52114)

* [CustomOP Inplace] Automap inplace dtype and shape, prepare for vector<Tensor> output

* delete dtype,shape func of multi_inplace op

* [CustomOP Inplace] Automap inplace dtype and shape, support vector<Tensor> output
Parent 888a30c9
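With this change, a custom operator that sets an InplaceMap no longer needs explicit InferShapeFn/InferDtypeFn registrations: each output's dims and dtype are automapped from the input it is mapped to, and paddle::Vec(...) outputs become legal as long as they appear in that map. A minimal registration sketch, mirroring the custom_add_vec test updated in this diff (AddVectorForward is the kernel defined there):

// Registration sketch: with an InplaceMap set, the default
// InferShape/InferDtype share each output's dims and dtype from its
// mapped input, so no .SetInferShapeFn / .SetInferDtypeFn is needed.
PD_BUILD_OP(custom_add_vec)
    .Inputs({paddle::Vec("X"), "Y"})
    .Outputs({paddle::Vec("Out")})
    .SetInplaceMap({{paddle::Vec("X"), paddle::Vec("Out")}})
    .SetKernelFn(PD_KERNEL(AddVectorForward));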
......@@ -268,15 +268,15 @@ static void RunKernelFunc(
for (size_t i = 0; i < outputs.size(); ++i) {
auto out_name = outputs[i];
if (detail::IsDuplicableVar(out_name)) {
PADDLE_ENFORCE(i == 0UL && outputs.size() == 1UL,
platform::errors::PreconditionNotMet(
"If custom operator's outputs contains `paddle::Vec("
")` type, "
"it only can hold one output."));
PADDLE_ENFORCE(
!inplace_map.empty() || (i == 0UL && outputs.size() == 1UL),
phi::errors::PreconditionNotMet(
"If custom operator's outputs contains `paddle::Vec()` type "
"without setting InplaceMap, it only can hold one output."));
auto vec_out = ctx.MultiOutput<phi::DenseTensor>(out_name);
PADDLE_ENFORCE_NE(vec_out.empty(),
true,
platform::errors::NotFound(
phi::errors::NotFound(
"Output vector<tensor> (%s) is empty.", out_name));
std::vector<paddle::Tensor> custom_vec_out;
for (size_t j = 0; j < vec_out.size(); ++j) {
......@@ -359,11 +359,67 @@ static void RunKernelFunc(
}
}
static void RunInferShapeFunc(framework::InferShapeContext* ctx,
static void RunDefaultInferShapeFunc(
framework::InferShapeContext* ctx,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::unordered_map<std::string, std::string>& inplace_map) {
if (inplace_map.empty()) { // general case, ensure single input and output
PADDLE_ENFORCE_EQ(
inputs.size(),
1UL,
phi::errors::Unavailable(
"Your custom operator contains multiple inputs. "
"We only allow a custom operator that contains only one input "
"and only one output without setting the InferShapeFn. "
"At this time, the input shape will be directly set to "
"the output shape.\n"
"Please set the InferShapeFn of custom "
"operator by .SetInferShapeFn(PD_INFER_SHAPE(...))"));
PADDLE_ENFORCE_EQ(
outputs.size(),
1UL,
phi::errors::Unavailable(
"Your custom operator contains multiple outputs. "
"We only allow a custom operator that contains only one input "
"and only one output without setting the InferShapeFn. "
"At this time, the input shape will be directly set to "
"the output shape.\n"
"Please set the InferShapeFn of custom "
"operator by .SetInferShapeFn(PD_INFER_SHAPE(...))"));
VLOG(3) << "Custom Operator: Default InferShape - share ddim.";
ctx->ShareDim(inputs[0], outputs[0]);
} else { // inplace case
PADDLE_ENFORCE_EQ(
inplace_map.size(),
outputs.size(),
phi::errors::Unavailable(
"Your custom operator uses `SetInplaceMap` without setting the "
"InferShapeFn. However, `Outputs` size = %d does not match the "
"`InplaceMap` size = %d. Please check `SetInplaceMap` again or set "
"the InferShapeFn of custom operator by "
"`.SetInferShapeFn(PD_INFER_SHAPE(...)`)",
outputs.size(),
inplace_map.size()));
for (auto const& pair : inplace_map) {
if (detail::IsDuplicableVar(pair.first)) {
ctx->SetOutputsDim(pair.second, ctx->GetInputsDim(pair.first));
} else {
ctx->ShareDim(pair.first, pair.second);
}
}
}
}
static void RunInferShapeFunc(
framework::InferShapeContext* ctx,
const paddle::InferShapeFunc& func,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::vector<std::string>& attrs) {
const std::vector<std::string>& attrs,
const std::unordered_map<std::string, std::string>& inplace_map,
const std::unordered_map<std::string, std::string>& inplace_reverse_map) {
std::vector<std::vector<int64_t>> input_shapes;
std::vector<std::vector<std::vector<int64_t>>> vec_input_shapes;
......@@ -450,22 +506,220 @@ static void RunInferShapeFunc(framework::InferShapeContext* ctx,
VLOG(3) << "Custom Operator: InferShape - calc output ddim.";
auto output_shapes = func(input_shapes, vec_input_shapes, custom_attrs);
if (inplace_map.empty()) {
PADDLE_ENFORCE_EQ(outputs.size(),
output_shapes.size(),
phi::errors::InvalidArgument(
"Your custom operator has set the InferShapeFn. "
"However, `Outputs` size = %d does not match the "
"returned vector size of InferShapeFn = %d. Please "
"check InferShapeFn again.",
outputs.size(),
output_shapes.size()));
} else {
PADDLE_ENFORCE_EQ(
outputs.size(),
output_shapes.size() + inplace_map.size(),
phi::errors::InvalidArgument(
"Your custom operator uses `SetInplaceMap` and sets the "
"InferShapeFn. However, `Outputs` size = %d does not match the "
"`InplaceMap size + InferShapeFn output size` = %d. Please check "
"InplaceMap and InferShapeFn again",
outputs.size(),
output_shapes.size() + inplace_map.size()));
}
VLOG(3)
<< "Custom Operator: InferShape - set output ddim: inplace_map.size() = "
<< inplace_map.size()
<< ", output_shapes.size() = " << output_shapes.size();
size_t output_shape_idx = 0;
for (size_t i = 0; i < outputs.size(); ++i) {
auto out_name = outputs[i];
if (detail::IsDuplicableVar(out_name)) {
PADDLE_ENFORCE(
inplace_reverse_map.find(out_name) != inplace_reverse_map.end(),
phi::errors::InvalidArgument(
"Custom operator only supports `paddle::Vec(...)` inputs and "
"cannot support `paddle::Vec(...)` output without setting "
"InplaceMap. If you have to use `paddle::Vec(...)` output, "
"please indicate it by setting InplaceMap manully."));
auto in_name = inplace_reverse_map.at(out_name);
ctx->SetOutputsDim(out_name, ctx->GetInputsDim(in_name));
} else {
if (inplace_reverse_map.find(out_name) != inplace_reverse_map.end()) {
// Share dims between inplace inputs and outputs
ctx->ShareDim(inplace_reverse_map.at(out_name), out_name);
} else {
// Set output dims by the output of InferShapeFn
ctx->SetOutputDim(out_name,
phi::make_ddim(output_shapes[output_shape_idx++]));
}
}
}
}
static void RunDefaultInferDtypeFunc(
framework::InferVarTypeContext* ctx,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::unordered_map<std::string, std::string>& inplace_map) {
if (inplace_map.empty()) { // general case, ensure single input and output
PADDLE_ENFORCE_EQ(
inputs.size(),
1UL,
platform::errors::Unavailable(
"Your custom operator contains multiple inputs. "
"We only allow a custom operator that contains only one input "
"and only one output without setting the InferDtypeFn. "
"At this time, the input dtype will be directly set to "
"the output dtype.\n"
"Please set the InferDtypeFn of custom "
"operator by `.SetInferDtypeFn(PD_INFER_DTYPE(...))`"));
PADDLE_ENFORCE_EQ(
outputs.size(),
1UL,
platform::errors::Unavailable(
"Your custom operator contains multiple outputs. "
"We only allow a custom operator that contains only one input "
"and only one output without setting the InferDtypeFn. "
"At this time, the input dtype will be directly set to "
"the output dtype.\n"
"Please set the InferDtypeFn of custom "
"operator by `.SetInferDtypeFn(PD_INFER_DTYPE(...))`"));
VLOG(3) << "Custom Operator: InferDtype - share dtype.";
auto dtype = ctx->GetInputDataType(inputs[0]);
ctx->SetOutputDataType(outputs[0], dtype);
} else { // inplace case
PADDLE_ENFORCE_EQ(
inplace_map.size(),
outputs.size(),
phi::errors::Unavailable(
"Your custom operator uses `SetInplaceMap` without setting the "
"InferDtypeFn. However, `Outputs` size = %d does not match the "
"`InplaceMap` size = %d. Please check `SetInplaceMap` again or set "
"the InferDtypeFn of custom operator by "
"`.SetInferDtypeFn(PD_INFER_DTYPE(...))`",
outputs.size(),
inplace_map.size()));
for (auto const& pair : inplace_map) {
VLOG(3) << "Custom Operator: InferDtype - inplace dtype: " << pair.first
<< "->" << pair.second;
if (detail::IsDuplicableVar(pair.first)) {
size_t size = ctx->InputSize(pair.first);
for (size_t i = 0; i < size; ++i) {
auto dtype = ctx->GetInputDataType(pair.first, i);
ctx->SetOutputDataType(pair.second, dtype, i);
}
} else {
auto dtype = ctx->GetInputDataType(pair.first);
ctx->SetOutputDataType(pair.second, dtype);
}
}
}
}
VLOG(3) << "Custom Operator: InferShape - set output ddim.";
static void RunInferDtypeFunc(
framework::InferVarTypeContext* ctx,
const paddle::InferDtypeFunc& func,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::unordered_map<std::string, std::string>& inplace_map,
const std::unordered_map<std::string, std::string>& inplace_reverse_map) {
std::vector<DataType> input_dtypes;
std::vector<std::vector<DataType>> vec_input_dtypes;
VLOG(3) << "Custom Operator: InferDtype - get input dtype.";
for (auto& in_name : inputs) {
if (detail::IsDuplicableVar(in_name)) {
std::vector<DataType> vec_custom_dtype;
if (ctx->HasInput(in_name)) { // general inputs
for (size_t i = 0; i < ctx->InputSize(in_name); ++i) {
auto dtype = ctx->GetInputDataType(in_name, i);
vec_custom_dtype.emplace_back(
paddle::framework::TransToPhiDataType(dtype));
}
} else { // optional inputs, `vec_custom_dtype` is empty
PADDLE_ENFORCE(
detail::IsOptionalVar(in_name),
phi::errors::NotFound("Your custom operator's InferDtypeFn "
"cannot find input parameter `%s`",
in_name));
VLOG(3) << "Custom Operator: InferDtypeFn's vector input " << in_name
<< " is optional dtype with None input";
}
vec_input_dtypes.emplace_back(vec_custom_dtype);
} else {
if (ctx->HasInput(in_name)) { // general inputs
auto dtype = ctx->GetInputDataType(in_name);
input_dtypes.emplace_back(paddle::framework::TransToPhiDataType(dtype));
} else { // optional inputs
PADDLE_ENFORCE(
detail::IsOptionalVar(in_name),
phi::errors::NotFound("Your custom operator's InferDtypeFn "
"cannot find input parameter `%s`",
in_name));
input_dtypes.emplace_back(DataType::UNDEFINED);
VLOG(3) << "Custom Operator: InferDtypeFn's input " << in_name
<< " is optional dtype with None input";
}
}
}
VLOG(3) << "Custom Operator: InferDtype - infer output dtype.";
auto output_dtypes = func(input_dtypes, vec_input_dtypes);
if (inplace_map.empty()) {
PADDLE_ENFORCE_EQ(outputs.size(),
output_dtypes.size(),
phi::errors::InvalidArgument(
"Your custom operator has set the InferDtypeFn. "
"However, `Outputs` size = %d does not match the "
"returned vector size of InferDtypeFn = %d. Please "
"check InferDtypeFn again.",
outputs.size(),
output_dtypes.size()));
} else {
PADDLE_ENFORCE_EQ(
outputs.size(),
output_dtypes.size() + inplace_map.size(),
phi::errors::InvalidArgument(
"Your custom operator uses `SetInplaceMap` and sets the "
"InferDtypeFn. However, `Outputs` size = %d does not match the "
"`InplaceMap size + InferDtypeFn output size` = %d. Please check "
"InplaceMap and InferDtypeFn again",
outputs.size(),
output_dtypes.size() + inplace_map.size()));
}
VLOG(3)
<< "Custom Operator: InferDtype - set output dtype: inplace_map.size() = "
<< inplace_map.size()
<< ", output_dtypes.size() = " << output_dtypes.size();
size_t output_dtype_idx = 0;
for (size_t i = 0; i < outputs.size(); ++i) {
auto out_name = outputs[i];
if (detail::IsDuplicableVar(out_name)) {
std::vector<DDim> vec_ddim;
vec_ddim.reserve(output_shapes.size());
std::transform(output_shapes.begin(),
output_shapes.end(),
std::back_inserter(vec_ddim),
[&](const std::vector<int64_t>& shape) -> DDim {
return phi::make_ddim(shape);
});
ctx->SetOutputsDim(out_name, vec_ddim);
PADDLE_ENFORCE(
inplace_reverse_map.find(out_name) != inplace_reverse_map.end(),
phi::errors::InvalidArgument(
"Custom operator only supports `paddle::Vec(...)` inputs and "
"cannot support `paddle::Vec(...)` output without setting "
"InplaceMap. If you have to use `paddle::Vec(...)` output, "
"please indicate it by setting InplaceMap manully."));
auto in_name = inplace_reverse_map.at(out_name);
ctx->SetOutputDataTypes(out_name, ctx->GetInputDataTypes(in_name));
} else {
ctx->SetOutputDim(out_name, phi::make_ddim(output_shapes[i]));
if (inplace_reverse_map.find(out_name) != inplace_reverse_map.end()) {
auto in_name = inplace_reverse_map.at(out_name);
// Share dtype between inplace inputs and outputs
ctx->SetOutputDataType(out_name, ctx->GetInputDataType(in_name));
} else {
// Set output dtype by the output of InferDtypeFn
ctx->SetOutputDataType(out_name,
paddle::framework::TransToProtoVarType(
output_dtypes[output_dtype_idx++]));
}
}
}
}
......@@ -822,6 +1076,8 @@ void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
auto& op_outputs = OpMetaInfoHelper::GetOutputs(base_op_meta);
auto& op_attrs = OpMetaInfoHelper::GetAttrs(base_op_meta);
auto& op_inplace_map = OpMetaInfoHelper::GetInplaceMap(base_op_meta);
auto& op_inplace_reverse_map =
OpMetaInfoHelper::GetInplaceReverseMap(base_op_meta);
auto& kernel_fn = OpMetaInfoHelper::GetKernelFn(base_op_meta);
auto& infer_shape_func = OpMetaInfoHelper::GetInferShapeFn(base_op_meta);
auto& infer_dtype_func = OpMetaInfoHelper::GetInferDtypeFn(base_op_meta);
......@@ -873,133 +1129,46 @@ void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
// InferShape
if (infer_shape_func == nullptr) {
// use default InferShape
info.infer_shape_ = [op_inputs, op_outputs](InferShapeContext* ctx) {
PADDLE_ENFORCE_EQ(
op_inputs.size(),
1UL,
platform::errors::Unavailable(
"Your custom operator contains multiple inputs. "
"We only allow a custom operator that contains only one input "
"and only one output without setting the InferShapeFn. "
"At this time, the input shape will be directly set to "
"the output shape.\n"
"Please set the InferShapeFn of custom "
"operator by .SetInferShapeFn(PD_INFER_SHAPE(...))"));
PADDLE_ENFORCE_EQ(
op_outputs.size(),
1UL,
platform::errors::Unavailable(
"Your custom operator contains multiple outputs. "
"We only allow a custom operator that contains only one input "
"and only one output without setting the InferShapeFn. "
"At this time, the input shape will be directly set to "
"the output shape.\n"
"Please set the InferShapeFn of custom "
"operator by .SetInferShapeFn(PD_INFER_SHAPE(...))"));
VLOG(3) << "Custom Operator: Default InferShape - share ddim.";
ctx->ShareDim(op_inputs[0], op_outputs[0]);
info.infer_shape_ =
[op_inputs, op_outputs, op_inplace_map](InferShapeContext* ctx) {
RunDefaultInferShapeFunc(ctx, op_inputs, op_outputs, op_inplace_map);
};
} else {
info.infer_shape_ = [op_inputs, op_outputs, op_attrs, infer_shape_func](
InferShapeContext* ctx) {
RunInferShapeFunc(ctx, infer_shape_func, op_inputs, op_outputs, op_attrs);
info.infer_shape_ = [op_inputs,
op_outputs,
op_attrs,
op_inplace_map,
op_inplace_reverse_map,
infer_shape_func](InferShapeContext* ctx) {
RunInferShapeFunc(ctx,
infer_shape_func,
op_inputs,
op_outputs,
op_attrs,
op_inplace_map,
op_inplace_reverse_map);
};
}
// Infer Dtype
if (infer_dtype_func == nullptr) {
// use default InferDtype
info.infer_var_type_ = [op_inputs, op_outputs](InferVarTypeContext* ctx) {
PADDLE_ENFORCE_EQ(
op_inputs.size(),
1UL,
platform::errors::Unavailable(
"Your custom operator contains multiple inputs. "
"We only allow a custom operator that contains only one input "
"and only one output without setting the InferDtypeFn. "
"At this time, the input dtype will be directly set to "
"the output dtype.\n"
"Please set the InferDtypeFn of custom "
"operator by .SetInferDtypeFn(PD_INFER_DTYPE(...))"));
PADDLE_ENFORCE_EQ(
op_outputs.size(),
1UL,
platform::errors::Unavailable(
"Your custom operator contains multiple outputs. "
"We only allow a custom operator that contains only one input "
"and only one output without setting the InferDtypeFn. "
"At this time, the input dtype will be directly set to "
"the output dtype.\n"
"Please set the InferDtypeFn of custom "
"operator by .SetInferDtypeFn(PD_INFER_DTYPE(...))"));
VLOG(3) << "Custom Operator: InferDtype - share dtype.";
auto dtype = ctx->GetInputDataType(op_inputs[0]);
ctx->SetOutputDataType(op_outputs[0], dtype);
};
} else {
info.infer_var_type_ =
[op_inputs, op_outputs, infer_dtype_func](InferVarTypeContext* ctx) {
std::vector<DataType> input_dtypes;
std::vector<std::vector<DataType>> vec_input_dtypes;
VLOG(3) << "Custom Operator: InferDtype - get input dtype.";
for (auto& in_name : op_inputs) {
if (detail::IsDuplicableVar(in_name)) {
std::vector<DataType> vec_custom_dtype;
if (ctx->HasInput(in_name)) { // general inputs
for (size_t i = 0; i < ctx->InputSize(in_name); ++i) {
auto dtype = ctx->GetInputDataType(in_name, i);
vec_custom_dtype.emplace_back(
paddle::framework::TransToPhiDataType(dtype));
}
} else { // optional inputs, `vec_custom_dtype` is empty
PADDLE_ENFORCE(
detail::IsOptionalVar(in_name),
phi::errors::NotFound("Your custom operator's InferDtypeFn "
"cannot find input parameter `%s`",
in_name));
VLOG(3) << "Custom Operator: InferDtypeFn's vector input "
<< in_name << " is optional dtype with None input";
}
vec_input_dtypes.emplace_back(vec_custom_dtype);
} else {
if (ctx->HasInput(in_name)) { // general inputs
auto dtype = ctx->GetInputDataType(in_name);
input_dtypes.emplace_back(
paddle::framework::TransToPhiDataType(dtype));
} else { // optional inputs
PADDLE_ENFORCE(
detail::IsOptionalVar(in_name),
phi::errors::NotFound("Your custom operator's InferDtypeFn "
"cannot find input parameter `%s`",
in_name));
input_dtypes.emplace_back(DataType::UNDEFINED);
VLOG(3) << "Custom Operator: InferDtypeFn's input " << in_name
<< " is optional dtype with None input";
}
}
}
VLOG(3) << "Custom Operator: InferDtype - infer output dtype.";
auto output_dtypes = infer_dtype_func(input_dtypes, vec_input_dtypes);
VLOG(3) << "Custom Operator: InferDtype - set output dtype.";
for (size_t i = 0; i < op_outputs.size(); ++i) {
auto out_name = op_outputs[i];
if (detail::IsDuplicableVar(out_name)) {
for (size_t j = 0; j < output_dtypes.size(); ++j) {
auto dtype =
paddle::framework::TransToProtoVarType(output_dtypes[i]);
ctx->SetOutputDataType(out_name, dtype, j);
}
[op_inputs, op_outputs, op_inplace_map](InferVarTypeContext* ctx) {
RunDefaultInferDtypeFunc(ctx, op_inputs, op_outputs, op_inplace_map);
};
} else {
ctx->SetOutputDataType(
out_name,
paddle::framework::TransToProtoVarType(output_dtypes[i]));
}
}
info.infer_var_type_ = [op_inputs,
op_outputs,
op_inplace_map,
op_inplace_reverse_map,
infer_dtype_func](InferVarTypeContext* ctx) {
RunInferDtypeFunc(ctx,
infer_dtype_func,
op_inputs,
op_outputs,
op_inplace_map,
op_inplace_reverse_map);
};
}
......@@ -1022,6 +1191,8 @@ void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
auto& grad_op_outputs = OpMetaInfoHelper::GetOutputs(cur_grad_op);
auto& grad_op_attrs = OpMetaInfoHelper::GetAttrs(cur_grad_op);
auto& grad_op_inplace_map = OpMetaInfoHelper::GetInplaceMap(cur_grad_op);
auto& grad_op_inplace_reverse_map =
OpMetaInfoHelper::GetInplaceReverseMap(cur_grad_op);
auto& grad_kernel_fn = OpMetaInfoHelper::GetKernelFn(cur_grad_op);
auto& grad_infer_shape_fn = OpMetaInfoHelper::GetInferShapeFn(cur_grad_op);
......@@ -1092,6 +1263,13 @@ void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
return new CustomOperator(type, inputs, outputs, attrs);
};
// Inplace
if (!grad_op_inplace_map.empty()) {
grad_info.infer_inplace_ = [grad_op_inplace_map](bool use_cuda) {
return grad_op_inplace_map;
};
}
// Grad InferShape
if (grad_infer_shape_fn == nullptr) {
grad_info.infer_shape_ = [grad_op_inputs,
......@@ -1135,12 +1313,16 @@ void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
grad_info.infer_shape_ = [grad_op_inputs,
grad_op_outputs,
grad_op_attrs,
grad_op_inplace_map,
grad_op_inplace_reverse_map,
grad_infer_shape_fn](InferShapeContext* ctx) {
RunInferShapeFunc(ctx,
grad_infer_shape_fn,
grad_op_inputs,
grad_op_outputs,
grad_op_attrs);
grad_op_attrs,
grad_op_inplace_map,
grad_op_inplace_reverse_map);
};
}
......
......@@ -518,6 +518,7 @@ static PyObject* eager_api_run_custom_op(PyObject* self,
"sure you registered your op first and try again. ",
op_type));
VLOG(7) << "Run Kernel of Custom Op: " << op_type;
// TODO(HongyuJia): Optimize Attrs Cast naming and implementation
std::vector<paddle::any> res_attrs = CastAttrsToTargetType(
ctx.Attrs(),
paddle::OpMetaInfoHelper::GetAttrs(meta_info_map.at(op_type)[0]));
......
......@@ -196,6 +196,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
template <typename... RemainingArgs>
struct ComputeCallHelper;
// Handle args for general Tensor input case
template <typename... Tail>
struct ComputeCallHelper<const Tensor&, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
......@@ -209,6 +210,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
}
};
// Handle args for optional Tensor input case
template <typename... Tail>
struct ComputeCallHelper<const paddle::optional<paddle::Tensor>&, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
......@@ -228,6 +230,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
}
};
// Handle args for general vector<Tensor> input case
template <typename... Tail>
struct ComputeCallHelper<const std::vector<Tensor>&, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
......@@ -241,6 +244,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
}
};
// Handle args for optional vector<Tensor> input case
template <typename... Tail>
struct ComputeCallHelper<const paddle::optional<std::vector<paddle::Tensor>>&,
Tail...> {
......@@ -293,6 +297,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
// Used to be compatible with 2.3 released internal inplace interface, not
// recommended
// Handle args for compatible inplace case
template <typename... Tail>
struct ComputeCallHelper<Tensor*, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
......@@ -310,6 +315,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
// recommended
// TODO(chenweihang): What is the appropriate output form?
// std::vector<Tensor>*? or std::vector<Tensor*>? or std::vector<Tensor*>*
// Handle args for compatible inplace case
template <typename... Tail>
struct ComputeCallHelper<std::vector<Tensor*>, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
......@@ -323,7 +329,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
}
};
// Handle Tensor& for inplace case
// Handle args for inplace Tensor case
template <typename... Tail>
struct ComputeCallHelper<Tensor&, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
......@@ -337,6 +343,20 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
}
};
// Handle args for inplace vector<Tensor> case
template <typename... Tail>
struct ComputeCallHelper<std::vector<Tensor>&, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
static void Compute(CustomOpKernelContext* ctx, PreviousArgs&... pargs) {
auto& range = ctx->InputRangeAt(in_idx);
auto arg = ctx->InputsBetween(range.first, range.second);
ComputeCallHelper<
Tail...>::template Compute<in_idx + 1, attr_idx, out_idx>(ctx,
pargs...,
arg);
}
};
template <int out_idx, typename T>
struct ComputeReturnHelper;
......@@ -739,6 +759,7 @@ class PADDLE_API OpMetaInfo {
std::vector<std::string> outputs_;
std::vector<std::string> attrs_;
std::unordered_map<std::string, std::string> inplace_map_;
std::unordered_map<std::string, std::string> inplace_reverse_map_;
// 2. func info
KernelFunc kernel_fn_{nullptr};
InferShapeFunc infer_shape_fn_{nullptr};
......@@ -767,6 +788,10 @@ class OpMetaInfoHelper {
const paddle::OpMetaInfo& info) {
return info.inplace_map_;
}
static const std::unordered_map<std::string, std::string>&
GetInplaceReverseMap(const paddle::OpMetaInfo& info) {
return info.inplace_reverse_map_;
}
static const KernelFunc& GetKernelFn(const paddle::OpMetaInfo& info) {
return info.kernel_fn_;
}
......
......@@ -134,6 +134,7 @@ const std::pair<size_t, size_t>& CustomOpKernelContext::OutputRangeAt(
// handle inplace mechanism
// Find out non-inplace output tensors.
// TODO(HongyuJia): Add cache for inplace_tensor_map_ to optimize performance
void CustomOpKernelContext::MapPlainOutputs(
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
......@@ -215,6 +216,9 @@ OpMetaInfo& OpMetaInfo::SetInplaceMap(
std::unordered_map<std::string, std::string>&& inplace_map) {
inplace_map_ =
std::forward<std::unordered_map<std::string, std::string>>(inplace_map);
for (const auto& pair : inplace_map_) {
inplace_reverse_map_[pair.second] = pair.first;
}
return *this;
}
OpMetaInfo& OpMetaInfo::SetKernelFn(KernelFunc&& func) {
......
......@@ -19,18 +19,18 @@
#include "paddle/extension.h"
template <typename data_t>
void add_forward_kernel(data_t* x_data, const data_t* y_data, int64_t numel) {
void add_data_pointer(const data_t* x_data, data_t* out_data, int64_t numel) {
for (size_t i = 0; i < numel; ++i) {
x_data[i] += y_data[i];
out_data[i] += x_data[i];
}
}
template <typename data_t>
void add_backward_kernel(data_t* y_grad_data,
const data_t* out_grad_data,
void assign_data_pointer(const data_t* x_data,
data_t* out_data,
int64_t numel) {
for (size_t i = 0; i < numel; ++i) {
y_grad_data[i] = out_grad_data[i];
out_data[i] = x_data[i];
}
}
......@@ -54,23 +54,12 @@ void relu_backward_kernel(const data_t* out_data,
void AddForward(paddle::Tensor& x, const paddle::Tensor& y) { // NOLINT
PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor.");
PD_DISPATCH_FLOATING_TYPES(x.type(), "AddForward", ([&] {
add_forward_kernel<data_t>(x.data<data_t>(),
y.data<data_t>(),
x.size());
PD_DISPATCH_FLOATING_TYPES(
x.type(), "AddForward", ([&] {
add_data_pointer<data_t>(y.data<data_t>(), x.data<data_t>(), x.size());
}));
}
std::vector<paddle::DataType> AddInferDtype(const paddle::DataType& x_dtype,
const paddle::DataType& y_dtype) {
return {x_dtype};
}
std::vector<std::vector<int64_t>> AddInferShape(
const std::vector<int64_t>& x_shape, const std::vector<int64_t>& y_shape) {
return {x_shape};
}
std::vector<paddle::Tensor> AddBackward(const paddle::Tensor& x,
const paddle::Tensor& y,
paddle::Tensor& out_grad) { // NOLINT
......@@ -81,8 +70,8 @@ std::vector<paddle::Tensor> AddBackward(const paddle::Tensor& x,
PD_DISPATCH_FLOATING_TYPES(
out_grad.type(), "AddBackward", ([&] {
add_backward_kernel<data_t>(
y_grad.data<data_t>(), out_grad.data<data_t>(), out_grad.size());
assign_data_pointer<data_t>(
out_grad.data<data_t>(), y_grad.data<data_t>(), out_grad.size());
}));
return {y_grad};
......@@ -92,9 +81,7 @@ PD_BUILD_OP(custom_add)
.Inputs({"X", "Y"})
.Outputs({"Out"})
.SetInplaceMap({{"X", "Out"}})
.SetKernelFn(PD_KERNEL(AddForward))
.SetInferShapeFn(PD_INFER_SHAPE(AddInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(AddInferDtype));
.SetKernelFn(PD_KERNEL(AddForward));
PD_BUILD_GRAD_OP(custom_add)
.Inputs({"X", "Y", paddle::Grad("Out")})
......@@ -102,6 +89,58 @@ PD_BUILD_GRAD_OP(custom_add)
.SetInplaceMap({{paddle::Grad("Out"), paddle::Grad("X")}})
.SetKernelFn(PD_KERNEL(AddBackward));
// out[i] = x[i] + y
void AddVectorForward(std::vector<paddle::Tensor>& x, // NOLINT
const paddle::Tensor& y) {
PD_CHECK(y.place() == paddle::PlaceType::kCPU, "y must be a CPU Tensor.");
PD_DISPATCH_FLOATING_TYPES(y.type(), "AddVectorForward", ([&] {
for (size_t i = 0; i < x.size(); ++i) {
add_data_pointer<data_t>(y.data<data_t>(),
x[i].data<data_t>(),
y.size());
}
}));
}
// dout[i] / dx[i] = out_grad[i] (needs no extra code; handled automatically via inplace)
// dout / dy = out_grad[0] + ... + out_grad[n - 1]
std::vector<paddle::Tensor> AddVectorBackward(
const std::vector<paddle::Tensor>& x,
const paddle::Tensor& y,
std::vector<paddle::Tensor>& out_grad) { // NOLINT
PD_CHECK(x[0].place() == paddle::PlaceType::kCPU,
"x[0] must be a CPU Tensor.");
PD_CHECK(y.place() == paddle::PlaceType::kCPU, "y must be a CPU Tensor.");
PD_CHECK(x.size() == out_grad.size(),
"x must have the same size as out_grad.");
paddle::Tensor y_grad = paddle::zeros(y.shape(), y.dtype(), y.place());
PD_DISPATCH_FLOATING_TYPES(
y.type(), "AddVectorBackward", ([&] {
// y_grad = out_grad[0] + ... + out_grad[n - 1]
for (size_t i = 0; i < out_grad.size(); ++i) {
add_data_pointer<data_t>(
out_grad[i].data<data_t>(), y_grad.data<data_t>(), y_grad.size());
}
}));
return {y_grad};
}
PD_BUILD_OP(custom_add_vec)
.Inputs({paddle::Vec("X"), "Y"})
.Outputs({paddle::Vec("Out")})
.SetInplaceMap({{paddle::Vec("X"), paddle::Vec("Out")}})
.SetKernelFn(PD_KERNEL(AddVectorForward));
PD_BUILD_GRAD_OP(custom_add_vec)
.Inputs({paddle::Vec("X"), "Y", paddle::Grad(paddle::Vec("Out"))})
.Outputs({paddle::Grad(paddle::Vec("X")), paddle::Grad("Y")})
.SetInplaceMap({{paddle::Grad(paddle::Vec("Out")),
paddle::Grad(paddle::Vec("X"))}})
.SetKernelFn(PD_KERNEL(AddVectorBackward));
void MultiInplaceForward(paddle::Tensor& x, // NOLINT
const paddle::Tensor& y,
paddle::Tensor& a, // NOLINT
......@@ -111,29 +150,11 @@ void MultiInplaceForward(paddle::Tensor& x, // NOLINT
PD_DISPATCH_FLOATING_TYPES(
x.type(), "MultiInplaceForward", ([&] {
add_forward_kernel<data_t>(
x.data<data_t>(), y.data<data_t>(), x.size());
add_forward_kernel<data_t>(
a.data<data_t>(), b.data<data_t>(), a.size());
add_data_pointer<data_t>(y.data<data_t>(), x.data<data_t>(), x.size());
add_data_pointer<data_t>(b.data<data_t>(), a.data<data_t>(), a.size());
}));
}
std::vector<paddle::DataType> MultiInplaceInferDtype(
const paddle::DataType& x_dtype,
const paddle::DataType& y_dtype,
const paddle::DataType& a_dtype,
const paddle::DataType& b_dtype) {
return {x_dtype, a_dtype};
}
std::vector<std::vector<int64_t>> MultiInplaceInferShape(
const std::vector<int64_t>& x_shape,
const std::vector<int64_t>& y_shape,
const std::vector<int64_t>& a_shape,
const std::vector<int64_t>& b_shape) {
return {x_shape, a_shape};
}
std::vector<paddle::Tensor> MultiInplaceBackward(
const paddle::Tensor& x,
const paddle::Tensor& y,
......@@ -151,11 +172,11 @@ std::vector<paddle::Tensor> MultiInplaceBackward(
PD_DISPATCH_FLOATING_TYPES(
outxy_grad.type(), "MultiInplaceBackward", ([&] {
add_backward_kernel<data_t>(y_grad.data<data_t>(),
outxy_grad.data<data_t>(),
assign_data_pointer<data_t>(outxy_grad.data<data_t>(),
y_grad.data<data_t>(),
outxy_grad.size());
add_backward_kernel<data_t>(b_grad.data<data_t>(),
outab_grad.data<data_t>(),
assign_data_pointer<data_t>(outab_grad.data<data_t>(),
b_grad.data<data_t>(),
outab_grad.size());
}));
......@@ -166,9 +187,7 @@ PD_BUILD_OP(custom_multi_inplace)
.Inputs({"X", "Y", "A", "B"})
.Outputs({"OutXY", "OutAB"})
.SetInplaceMap({{"X", "OutXY"}, {"A", "OutAB"}})
.SetKernelFn(PD_KERNEL(MultiInplaceForward))
.SetInferShapeFn(PD_INFER_SHAPE(MultiInplaceInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(MultiInplaceInferDtype));
.SetKernelFn(PD_KERNEL(MultiInplaceForward));
PD_BUILD_GRAD_OP(custom_multi_inplace)
.Inputs({"X", "Y", paddle::Grad("OutXY"), "A", "B", paddle::Grad("OutAB")})
......
......@@ -40,6 +40,54 @@ custom_inplace = load(
verbose=True,
)
# Temporarily assemble custom python API
import paddle.fluid.core as core
from paddle.fluid.core import CustomOpKernelContext
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.layer_helper import LayerHelper
def custom_add_vec(x_vector, y):
# prepare inputs and outputs
attrs = {}
outs = {}
out_names = ["Out@VECTOR"]
# The output variable's dtype uses the default value 'float32',
# and the actual dtype of the output variable will be inferred at runtime.
if in_dygraph_mode():
ctx = CustomOpKernelContext()
for i in [x_vector, y]:
ctx.add_inputs(i)
for out_name in out_names:
outs[out_name] = [core.eager.Tensor() for _ in range(len(x_vector))]
ctx.add_outputs(outs[out_name])
core.eager._run_custom_op(ctx, "custom_add_vec", True)
else:
ins = {}
for key, value in dict({"X@VECTOR": x_vector, "Y": y}).items():
# handle optional inputs
if value is not None:
ins[key] = value
helper = LayerHelper("custom_add_vec", **locals())
for out_name in out_names:
outs[out_name] = [
helper.create_variable(dtype='float32')
for _ in range(len(x_vector))
]
helper.append_op(
type="custom_add_vec", inputs=ins, outputs=outs, attrs=attrs
)
res = [outs[out_name] for out_name in out_names]
return res[0] if len(res) == 1 else res
# Set custom python API manually
custom_inplace.custom_add_vec = custom_add_vec
def inplace_dynamic_add(phi_func, device, dtype, np_x, np_y):
paddle.set_device(device)
......@@ -88,7 +136,89 @@ def inplace_static_add(func, device, dtype, np_x, np_y):
return x_v, out_v, x_grad_v, y_grad_v, out_grad_v
def inplace_dynamic_relu(phi_func, device, dtype, np_x, np_y, np_z):
def inplace_dynamic_add_vector(phi_func, device, dtype, np_inputs, np_y):
paddle.set_device(device)
inputs = [
paddle.to_tensor(np_input, dtype=dtype, stop_gradient=True)
for np_input in np_inputs
]
y = paddle.to_tensor(np_y, dtype=dtype, stop_gradient=False)
if phi_func:
out = custom_inplace.custom_add_vec(inputs, y)
else:
out = [x.add_(y) for x in inputs]
mean_out = paddle.mean(paddle.concat(out))
mean_out.backward()
return (
np.concatenate([input.numpy() for input in inputs]),
y.numpy(),
np.concatenate([o.numpy() for o in out]),
np.concatenate([input.grad.numpy() for input in inputs]),
y.grad.numpy(),
)
def inplace_static_add_vector(phi_func, device, dtype, np_inputs, np_y):
paddle.enable_static()
paddle.set_device(device)
with static.scope_guard(static.Scope()):
with static.program_guard(static.Program()):
x1 = static.data(
name="x1", shape=[None, np_inputs[0].shape[1]], dtype=dtype
)
x2 = static.data(
name="x2", shape=[None, np_inputs[1].shape[1]], dtype=dtype
)
y = static.data(name="y", shape=[None, np_y.shape[1]], dtype=dtype)
x1.stop_gradient = False
x2.stop_gradient = False
y.stop_gradient = False
if phi_func:
out = custom_inplace.custom_add_vec([x1, x2], y)
else:
out = [paddle.add(x1, y), paddle.add(x2, y)]
mean_out = paddle.mean(paddle.concat(out))
static.append_backward(mean_out)
exe = static.Executor()
exe.run(static.default_startup_program())
(
out0_v,
out1_v,
x1_grad_v,
x2_grad_v,
y_grad_v,
out0_grad_v,
out1_grad_v,
) = exe.run(
static.default_main_program(),
feed={
"x1": np_inputs[0].astype(dtype),
"x2": np_inputs[1].astype(dtype),
"y": np_y.astype(dtype),
},
fetch_list=[
out[0].name,
out[1].name,
x1.name + "@GRAD",
x2.name + "@GRAD",
y.name + "@GRAD",
out[0].name + "@GRAD",
out[1].name + "@GRAD",
],
)
paddle.disable_static()
return (
[out0_v, out1_v],
[x1_grad_v, x2_grad_v],
y_grad_v,
[out0_grad_v, out1_grad_v],
)
def inplace_dynamic_relu_net(phi_func, device, dtype, np_x, np_y, np_z):
paddle.set_device(device)
x = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False)
y = paddle.to_tensor(np_y, dtype=dtype, stop_gradient=False)
......@@ -107,7 +237,7 @@ def inplace_dynamic_relu(phi_func, device, dtype, np_x, np_y, np_z):
return x.numpy(), y.numpy(), out.numpy(), x.grad.numpy(), y.grad.numpy()
def inplace_static_relu(func, device, dtype, np_x, np_y, np_z):
def inplace_static_relu_net(func, device, dtype, np_x, np_y, np_z):
paddle.enable_static()
paddle.set_device(device)
with static.scope_guard(static.Scope()):
......@@ -255,6 +385,10 @@ class TestCustomInplaceJit(unittest.TestCase):
self.np_z = np.random.random((3, 2)).astype("float32")
self.np_a = np.random.random((3, 2)).astype("float32")
self.np_b = np.random.random((3, 2)).astype("float32")
self.np_inputs = [
np.random.random((3, 2)).astype("float32"),
np.random.random((3, 2)).astype("float32"),
]
def check_output(self, out, pd_out, name):
np.testing.assert_array_equal(
......@@ -354,7 +488,79 @@ class TestCustomInplaceJit(unittest.TestCase):
self.check_output(phi_x_grad, pd_x_grad, "x_grad")
self.check_output(phi_y_grad, pd_y_grad, "y_grad")
def test_static_multiple_inplace_relu(self):
def test_static_add_vector(self):
for device in self.devices:
for dtype in self.dtypes:
(
pd_out,
pd_x_grad,
pd_y_grad,
pd_out_grad,
) = inplace_static_add_vector(
True,
device,
dtype,
self.np_inputs,
self.np_y,
)
(
phi_out,
phi_x_grad,
phi_y_grad,
phi_out_grad,
) = inplace_static_add_vector(
False,
device,
dtype,
self.np_inputs,
self.np_y,
)
self.check_output(phi_out, pd_out, "out")
self.check_output(phi_x_grad, pd_x_grad, "x_grad")
self.check_output(phi_y_grad, pd_y_grad, "y_grad")
self.check_output(phi_out_grad, pd_out_grad, "out_grad")
def test_dynamic_add_vector(self):
for device in self.devices:
for dtype in self.dtypes:
(
pd_x,
pd_y,
pd_out,
pd_x_grad,
pd_y_grad,
) = inplace_dynamic_add_vector(
True,
device,
dtype,
self.np_inputs,
self.np_y,
)
(
phi_x,
phi_y,
phi_out,
phi_x_grad,
phi_y_grad,
) = inplace_dynamic_add_vector(
False,
device,
dtype,
self.np_inputs,
self.np_y,
)
self.check_output(phi_x, phi_out, "inplace_phi_x")
self.check_output(pd_x, pd_out, "inplace_pd_x")
self.check_output(phi_x, pd_x, "x")
self.check_output(phi_y, pd_y, "y")
self.check_output(phi_out, pd_out, "out")
self.check_output(phi_x_grad, pd_x_grad, "x_grad")
self.check_output(phi_y_grad, pd_y_grad, "y_grad")
def test_static_relu_net(self):
for device in self.devices:
for dtype in self.dtypes:
(
......@@ -363,7 +569,7 @@ class TestCustomInplaceJit(unittest.TestCase):
pd_out,
pd_x_grad,
pd_y_grad,
) = inplace_static_relu(
) = inplace_static_relu_net(
paddle.nn.functional.relu,
device,
dtype,
......@@ -377,7 +583,7 @@ class TestCustomInplaceJit(unittest.TestCase):
phi_out,
phi_x_grad,
phi_y_grad,
) = inplace_static_relu(
) = inplace_static_relu_net(
custom_inplace.custom_relu_inplace,
device,
dtype,
......@@ -391,7 +597,7 @@ class TestCustomInplaceJit(unittest.TestCase):
self.check_output_allclose(phi_x_grad, pd_x_grad, "x_grad")
self.check_output_allclose(phi_y_grad, pd_y_grad, "y_grad")
def test_dynamic_multiple_inplace_relu(self):
def test_dynamic_relu_net(self):
for device in self.devices:
for dtype in self.dtypes:
(
......@@ -400,7 +606,7 @@ class TestCustomInplaceJit(unittest.TestCase):
pd_out,
pd_x_grad,
pd_y_grad,
) = inplace_dynamic_relu(
) = inplace_dynamic_relu_net(
False,
device,
dtype,
......@@ -414,7 +620,7 @@ class TestCustomInplaceJit(unittest.TestCase):
phi_out,
phi_x_grad,
phi_y_grad,
) = inplace_dynamic_relu(
) = inplace_dynamic_relu_net(
True,
device,
dtype,
......