“dcc239032362ef1d9a808021534d6293f2e3d08d”上不存在“...asr1/git@gitcode.net:paddlepaddle/DeepSpeech.git”
未验证 提交 01247e33 编写于 作者: H HongyuJia 提交者: GitHub

[Opt Performance] Optimize custom operator performance (#52597)

* [Opt Performance] Optimize custom operator performance, reconstruct python API auto-gen, add cache and use const inference

* opt AutoGradMeta implementation

* remove profiler codes

* fix unit test

* change year, 2021->2023

* fix int64_t parse bug
上级 90c3bddf
...@@ -236,7 +236,8 @@ RunCustomOpNode::operator()(paddle::small_vector<std::vector<paddle::Tensor>, ...@@ -236,7 +236,8 @@ RunCustomOpNode::operator()(paddle::small_vector<std::vector<paddle::Tensor>,
VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_ << "_grad"; VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_ << "_grad";
// handle inplace map // handle inplace map
ctx.MapPlainOutputs(grad_inputs_name, grad_outputs_names, grad_inplace_map); ctx.UpdatePlainOutputs(
grad_inputs_name, grad_outputs_names, grad_inplace_map);
(*paddle::OpMetaInfoHelper::GetKernelFn(kernel_map.at(op_type_)[1]))(&ctx); (*paddle::OpMetaInfoHelper::GetKernelFn(kernel_map.at(op_type_)[1]))(&ctx);
ctx.AssignInplaceOutputs(); ctx.AssignInplaceOutputs();
...@@ -443,7 +444,8 @@ RunCustomOpDoubleGradNode::operator()( ...@@ -443,7 +444,8 @@ RunCustomOpDoubleGradNode::operator()(
VLOG(7) << "Run Kernel of Grad Custom Op: " << name(); VLOG(7) << "Run Kernel of Grad Custom Op: " << name();
// handle inplace map // handle inplace map
ctx.MapPlainOutputs(grad_inputs_name, grad_outputs_names, grad_inplace_map); ctx.UpdatePlainOutputs(
grad_inputs_name, grad_outputs_names, grad_inplace_map);
(*paddle::OpMetaInfoHelper::GetKernelFn(kernel_map.at(op_type_)[2]))(&ctx); (*paddle::OpMetaInfoHelper::GetKernelFn(kernel_map.at(op_type_)[2]))(&ctx);
ctx.AssignInplaceOutputs(); ctx.AssignInplaceOutputs();
......
...@@ -28,6 +28,7 @@ limitations under the License. */ ...@@ -28,6 +28,7 @@ limitations under the License. */
#include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/custom_operator_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/phi_utils.h"
...@@ -52,87 +53,6 @@ DECLARE_string(tensor_operants_mode); ...@@ -52,87 +53,6 @@ DECLARE_string(tensor_operants_mode);
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace detail {
// Dynamic-library symbol loader: resolves `name` from the already-opened
// library `handle` via dlsym and casts the symbol to function type T.
// Aborts with a NotFound error (carrying the platform error detail) when
// the symbol cannot be resolved.
template <typename T>
static T* DynLoad(void* handle, std::string name) {
  T* func = reinterpret_cast<T*>(dlsym(handle, name.c_str()));
#if !defined(_WIN32)
  // POSIX: dlerror() returns the message for the last dl* failure.
  auto errorno = dlerror();
#else
  // Windows: GetLastError() yields a numeric error code instead.
  auto errorno = GetLastError();
#endif  // !_WIN32
  PADDLE_ENFORCE_NOT_NULL(
      func,
      platform::errors::NotFound(
          "Failed to load dynamic operator library, error message(%s).",
          errorno));
  return func;
}
// True when `var_name` contains the vector-of-Tensor marker
// (kTensorVectorSuffix), i.e. the variable denotes std::vector<Tensor>.
inline static bool IsDuplicableVar(const std::string& var_name) {
  const std::string vector_mark(kTensorVectorSuffix);
  return var_name.rfind(vector_mark) != std::string::npos;
}
// True when `var_name` contains the optional marker (kOptionalSuffix).
inline static bool IsOptionalVar(const std::string& var_name) {
  const std::string optional_mark(kOptionalSuffix);
  return var_name.rfind(optional_mark) != std::string::npos;
}
// Strips the trailing grad suffix (kGradVarSuffix) from `var_name`.
// For double-grad cases, a kDoubleGradNewOutSuffix marker is removed
// first when present.
inline static std::string NoGrad(const std::string& var_name,
                                 bool is_double_grad = false) {
  std::string new_out_suffix = kDoubleGradNewOutSuffix;
  std::string tmp_var_name(var_name);
  if (is_double_grad &&
      (tmp_var_name.rfind(new_out_suffix) != std::string::npos)) {
    // Use the suffix's actual length instead of a hard-coded `4`, so this
    // stays correct if kDoubleGradNewOutSuffix ever changes.
    tmp_var_name =
        tmp_var_name.substr(0, tmp_var_name.size() - new_out_suffix.size());
  }
  return tmp_var_name.substr(0, tmp_var_name.size() - kGradVarSuffixSize);
}
// Decides whether `var_name` denotes a gradient variable.
inline static bool IsGradVar(const std::string& var_name, bool is_double_grad) {
  const std::string grad_mark(kGradVarSuffix);
  if (is_double_grad) {
    // In double-grad mode X@GRAD is NOT a grad var while X@GRAD@GRAD is,
    // so strip one grad suffix before looking for the marker.
    return NoGrad(var_name).rfind(grad_mark) != std::string::npos;
  }
  return var_name.rfind(grad_mark) != std::string::npos;
}
// True when `name` occurs in `vec`.
inline static bool IsMemberOf(const std::vector<std::string>& vec,
                              const std::string& name) {
  return std::any_of(vec.cbegin(), vec.cend(), [&name](const std::string& s) {
    return s == name;
  });
}
// Splits a custom-op attribute declaration of the form "<name>:<type>"
// into {name, type} (both trimmed of surrounding spaces).
// Aborts via PADDLE_ENFORCE_NE when no ':' is present.
static std::vector<std::string> ParseAttrStr(const std::string& attr) {
  auto split_pos = attr.find_first_of(":");
  PADDLE_ENFORCE_NE(split_pos,
                    std::string::npos,
                    platform::errors::InvalidArgument(
                        "Invalid attribute string format. Attribute string "
                        "format is `<name>:<type>`."));
  std::vector<std::string> rlt;
  // 1. name
  rlt.emplace_back(string::trim_spaces(attr.substr(0, split_pos)));
  // 2. type
  rlt.emplace_back(string::trim_spaces(attr.substr(split_pos + 1)));
  VLOG(3) << "attr name: " << rlt[0] << ", attr type str: " << rlt[1];
  return rlt;
}
} // namespace detail
////////////////// Kernel Define ////////////////////
// custom op kernel call function define // custom op kernel call function define
static void RunKernelFunc( static void RunKernelFunc(
const framework::ExecutionContext& ctx, const framework::ExecutionContext& ctx,
...@@ -355,7 +275,7 @@ static void RunKernelFunc( ...@@ -355,7 +275,7 @@ static void RunKernelFunc(
} }
// handle inplace map // handle inplace map
kernel_ctx.MapPlainOutputs(inputs, outputs, inplace_map); kernel_ctx.UpdatePlainOutputs(inputs, outputs, inplace_map);
func(&kernel_ctx); func(&kernel_ctx);
kernel_ctx.AssignInplaceOutputs(); kernel_ctx.AssignInplaceOutputs();
......
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/phi/api/ext/op_meta_info.h"
namespace paddle {
namespace framework {
namespace detail {
// Dynamic-library symbol loader: resolves `name` from the already-opened
// library `handle` via dlsym and casts the symbol to function type T.
// Aborts with a NotFound error (carrying the platform error detail) when
// the symbol cannot be resolved.
template <typename T>
static T* DynLoad(void* handle, std::string name) {
  T* func = reinterpret_cast<T*>(dlsym(handle, name.c_str()));
#if !defined(_WIN32)
  // POSIX: dlerror() returns the message for the last dl* failure.
  auto errorno = dlerror();
#else
  // Windows: GetLastError() yields a numeric error code instead.
  auto errorno = GetLastError();
#endif  // !_WIN32
  PADDLE_ENFORCE_NOT_NULL(
      func,
      platform::errors::NotFound(
          "Failed to load dynamic operator library, error message(%s).",
          errorno));
  return func;
}
// True when `var_name` contains the vector-of-Tensor marker
// (kTensorVectorSuffix), i.e. the variable denotes std::vector<Tensor>.
inline static bool IsDuplicableVar(const std::string& var_name) {
  const std::string vector_mark(kTensorVectorSuffix);
  return var_name.rfind(vector_mark) != std::string::npos;
}
// True when `var_name` contains the optional marker (kOptionalSuffix).
inline static bool IsOptionalVar(const std::string& var_name) {
  const std::string optional_mark(kOptionalSuffix);
  return var_name.rfind(optional_mark) != std::string::npos;
}
// Strips the trailing grad suffix (kGradVarSuffix) from `var_name`.
// For double-grad cases, a kDoubleGradNewOutSuffix marker is removed
// first when present.
inline static std::string NoGrad(const std::string& var_name,
                                 bool is_double_grad = false) {
  std::string new_out_suffix = kDoubleGradNewOutSuffix;
  std::string tmp_var_name(var_name);
  if (is_double_grad &&
      (tmp_var_name.rfind(new_out_suffix) != std::string::npos)) {
    // Use the suffix's actual length instead of a hard-coded `4`, so this
    // stays correct if kDoubleGradNewOutSuffix ever changes.
    tmp_var_name =
        tmp_var_name.substr(0, tmp_var_name.size() - new_out_suffix.size());
  }
  return tmp_var_name.substr(0, tmp_var_name.size() - kGradVarSuffixSize);
}
// Decides whether `var_name` denotes a gradient variable.
inline static bool IsGradVar(const std::string& var_name, bool is_double_grad) {
  const std::string grad_mark(kGradVarSuffix);
  if (is_double_grad) {
    // In double-grad mode X@GRAD is NOT a grad var while X@GRAD@GRAD is,
    // so strip one grad suffix before looking for the marker.
    return NoGrad(var_name).rfind(grad_mark) != std::string::npos;
  }
  return var_name.rfind(grad_mark) != std::string::npos;
}
// True when `name` occurs in `vec`.
inline static bool IsMemberOf(const std::vector<std::string>& vec,
                              const std::string& name) {
  return std::any_of(vec.cbegin(), vec.cend(), [&name](const std::string& s) {
    return s == name;
  });
}
// Splits a custom-op attribute declaration of the form "<name>:<type>"
// into {name, type} (both trimmed of surrounding spaces).
// Aborts via PADDLE_ENFORCE_NE when no ':' is present.
static std::vector<std::string> ParseAttrStr(const std::string& attr) {
  auto split_pos = attr.find_first_of(":");
  PADDLE_ENFORCE_NE(split_pos,
                    std::string::npos,
                    platform::errors::InvalidArgument(
                        "Invalid attribute string format. Attribute string "
                        "format is `<name>:<type>`."));
  std::vector<std::string> rlt;
  // 1. name
  rlt.emplace_back(string::trim_spaces(attr.substr(0, split_pos)));
  // 2. type
  rlt.emplace_back(string::trim_spaces(attr.substr(split_pos + 1)));
  VLOG(3) << "attr name: " << rlt[0] << ", attr type str: " << rlt[1];
  return rlt;
}
} // namespace detail
} // namespace framework
} // namespace paddle
...@@ -33,6 +33,7 @@ typedef SSIZE_T ssize_t; ...@@ -33,6 +33,7 @@ typedef SSIZE_T ssize_t;
#include "paddle/fluid/eager/utils.h" #include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/custom_operator.h" #include "paddle/fluid/framework/custom_operator.h"
#include "paddle/fluid/framework/custom_operator_utils.h"
#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/framework/python_headers.h" #include "paddle/fluid/framework/python_headers.h"
#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator.h"
...@@ -43,6 +44,7 @@ typedef SSIZE_T ssize_t; ...@@ -43,6 +44,7 @@ typedef SSIZE_T ssize_t;
#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/op_function_common.h"
#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/ext/op_meta_info.h"
#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/allocator.h"
...@@ -424,55 +426,6 @@ static void ConstructFwdAndBwdMap( ...@@ -424,55 +426,6 @@ static void ConstructFwdAndBwdMap(
} }
} }
// Normalizes attribute values received from Python to the C++ types the
// custom operator declared: Python values may arrive as bool/int while the
// op expects int or int64_t, so bool is widened to int, and bool/int are
// widened to int64_t. All other declared types pass through unchanged.
// NOTE(review): the declared type is extracted by searching for ": "
// (colon followed by a space), so this assumes each attrs_names entry is
// formatted "<name>: <type>" — confirm against the attr-string producer.
static std::vector<paddle::any> CastAttrsToTargetType(
    const std::vector<paddle::any>& src,
    const std::vector<std::string>& attrs_names) {
  std::vector<paddle::any> res;
  // Sizes must correspond element-wise; mismatch means the op registration
  // and the values handed in from Python disagree.
  PADDLE_ENFORCE_EQ(src.size(),
                    attrs_names.size(),
                    paddle::platform::errors::InvalidArgument(
                        "We Expected same size of attrs and attrs_name list, "
                        "if u got this error indicate your custom op setting "
                        "%s attrs, but you just give %s",
                        attrs_names.size(),
                        src.size()));
  for (size_t i = 0; i < src.size(); i++) {
    size_t end = attrs_names[i].find(": ");
    std::string type_name = attrs_names[i].substr(end + 2);
    if (type_name == "int") {
      // bool promotes to int; any other runtime type is rejected.
      if (src[i].type() == typeid(bool)) {
        res.emplace_back(static_cast<int>(paddle::any_cast<bool>(src[i])));
      } else if (src[i].type() == typeid(int)) {
        res.emplace_back(src[i]);
      } else {
        PADDLE_THROW(platform::errors::InvalidArgument(
            "Your No. %s attrs should only can be bool or int32, other type is "
            "forbidden for now but we got %s. Check your code first please",
            i,
            src[i].type().name()));
      }
    } else if (type_name == "int64_t") {
      // bool and int both widen to int64_t.
      if (src[i].type() == typeid(bool)) {
        res.emplace_back(static_cast<int64_t>(paddle::any_cast<bool>(src[i])));
      } else if (src[i].type() == typeid(int)) {
        res.emplace_back(static_cast<int64_t>(paddle::any_cast<int>(src[i])));
      } else if (src[i].type() == typeid(int64_t)) {
        res.emplace_back(src[i]);
      } else {
        PADDLE_THROW(platform::errors::InvalidArgument(
            "Your No. %s attrs should only can be bool or int32 or int64_t, "
            "other type is forbidden for now but we got %s. Check your code "
            "first please",
            i,
            src[i].type().name()));
      }
    } else {
      // Non-integral declared types require no conversion here.
      res.emplace_back(src[i]);
    }
  }
  return res;
}
static PyObject* eager_api_jit_function_call(PyObject* self, static PyObject* eager_api_jit_function_call(PyObject* self,
PyObject* args, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
...@@ -534,6 +487,25 @@ static PyObject* eager_api__get_custom_operator_inplace_reverse_idx( ...@@ -534,6 +487,25 @@ static PyObject* eager_api__get_custom_operator_inplace_reverse_idx(
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
// This function copies from function `EmptyTensorInitializer` with default
// parameters.
// Builds a named, zero-element FLOAT32 DenseTensor (shape {0}, null
// allocation) with fresh autograd meta and an accumulation grad node,
// suitable as a pre-constructed output slot for a custom operator.
static Tensor InitializedEmptyTensor() {
  // Zero-element shape; the kernel is expected to resize/fill it later.
  auto ddims = phi::make_ddim({0});
  auto tensor = paddle::Tensor();
  tensor.set_name(
      egr::Controller::Instance().GenerateUniqueName("generated_tensor"));
  auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor);
  autograd_meta->SetPersistable(false);
  // DenseTensor with a null allocation: metadata only, no memory attached.
  std::shared_ptr<phi::DenseTensor> dense_tensor = nullptr;
  std::shared_ptr<phi::Allocation> allocation_ptr = nullptr;
  dense_tensor = std::make_shared<phi::DenseTensor>(
      allocation_ptr, phi::DenseTensorMeta(phi::DataType::FLOAT32, ddims));
  tensor.set_impl(dense_tensor);
  // Leaf-style grad node so gradients can accumulate into this tensor.
  autograd_meta->SetGradNode(
      std::make_shared<egr::GradNodeAccumulation>(autograd_meta));
  return tensor;
}
static PyObject* eager_api_run_custom_op(PyObject* self, static PyObject* eager_api_run_custom_op(PyObject* self,
PyObject* args, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
...@@ -545,14 +517,11 @@ static PyObject* eager_api_run_custom_op(PyObject* self, ...@@ -545,14 +517,11 @@ static PyObject* eager_api_run_custom_op(PyObject* self,
VLOG(4) << "Initialize phi tensor operants successfully"; VLOG(4) << "Initialize phi tensor operants successfully";
} }
paddle::CustomOpKernelContext ctx = std::string op_type = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 0), 0);
CastPyArg2CustomOpKernelContext(PyTuple_GET_ITEM(args, 0), 0); VLOG(7) << "Get things from python for Custom Op: " << op_type;
std::string op_type = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 1), 1); paddle::CustomOpKernelContext ctx;
bool trace_backward = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2);
{ {
eager_gil_scoped_release guard; eager_gil_scoped_release guard;
VLOG(7) << "Get things for python for Custom Op: " << op_type
<< ", trace_backward is: " << trace_backward;
auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap();
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
meta_info_map.find(op_type), meta_info_map.find(op_type),
...@@ -562,40 +531,138 @@ static PyObject* eager_api_run_custom_op(PyObject* self, ...@@ -562,40 +531,138 @@ static PyObject* eager_api_run_custom_op(PyObject* self,
"created by LoadOpMetaInfoAndRegisterOp, please make " "created by LoadOpMetaInfoAndRegisterOp, please make "
"sure you registered your op first and try again. ", "sure you registered your op first and try again. ",
op_type)); op_type));
VLOG(7) << "Run Kernel of Custom Op: " << op_type;
// TODO(HongyuJia): Optimize Attrs Cast naming and implementation
std::vector<paddle::any> res_attrs = CastAttrsToTargetType(
ctx.Attrs(),
paddle::OpMetaInfoHelper::GetAttrs(meta_info_map.at(op_type)[0]));
ctx.EmplaceBackAttrs(res_attrs);
const auto& vec_map = meta_info_map.at(op_type); const auto& vec_map = meta_info_map.at(op_type);
const auto& inputs = paddle::OpMetaInfoHelper::GetInputs(vec_map[0]);
const auto& inputs = const auto& attrs = paddle::OpMetaInfoHelper::GetAttrs(vec_map[0]);
paddle::OpMetaInfoHelper::GetInputs(meta_info_map.at(op_type)[0]); const auto& outputs = paddle::OpMetaInfoHelper::GetOutputs(vec_map[0]);
const auto& outputs =
paddle::OpMetaInfoHelper::GetOutputs(meta_info_map.at(op_type)[0]);
const auto& inplace_map = const auto& inplace_map =
paddle::OpMetaInfoHelper::GetInplaceMap(meta_info_map.at(op_type)[0]); paddle::OpMetaInfoHelper::GetInplaceMap(vec_map[0]);
for (size_t i = 0; i < inputs.size(); ++i) {
const auto& input = inputs.at(i);
// Parse op_type first, so that use i + 1
PyObject* obj = PyTuple_GET_ITEM(args, i + 1);
// Emplace Py_None from python, this means optional inputs passed to C++,
// use one un-initialized tensor to indicate both Tensor and
// vector<Tensor> inputs.
if (obj == Py_None) {
VLOG(7) << "Custom operator add input " << input
<< " to CustomOpKernelContext. Add un-initialized tensor "
"because the optional input is None";
ctx.EmplaceBackInput(std::move(paddle::Tensor()));
continue;
}
if (paddle::framework::detail::IsDuplicableVar(input)) {
ctx.EmplaceBackInputs(std::move(CastPyArg2VectorOfTensor(obj, i + 1)));
VLOG(7) << "Custom operator add input " << input
<< " to CustomOpKernelContext. Add vector<Tensor> size = "
<< ctx.InputRangeAt(i).second - ctx.InputRangeAt(i).first;
} else {
ctx.EmplaceBackInput(std::move(CastPyArg2Tensor(obj, i + 1)));
VLOG(7) << "Custom operator add input " << input
<< " to CustomOpKernelContext. Add Tensor for general case.";
}
}
// Parse op_type and inputs first, so that use 1 + inputs.size() + i
int attr_start_idx = 1 + inputs.size();
for (size_t i = 0; i < attrs.size(); ++i) {
const auto& attr = attrs.at(i);
std::vector<std::string> attr_name_and_type =
paddle::framework::detail::ParseAttrStr(attr);
auto attr_type_str = attr_name_and_type[1];
VLOG(7) << "Custom operator add attrs " << attr_name_and_type[0]
<< " to CustomOpKernelContext. Attribute type = "
<< attr_type_str;
PyObject* obj = PyTuple_GET_ITEM(args, attr_start_idx + i);
if (attr_type_str == "bool") {
ctx.EmplaceBackAttr(CastPyArg2AttrBoolean(obj, attr_start_idx + i));
} else if (attr_type_str == "int") {
ctx.EmplaceBackAttr(CastPyArg2AttrInt(obj, attr_start_idx + i));
} else if (attr_type_str == "float") {
ctx.EmplaceBackAttr(CastPyArg2AttrFloat(obj, attr_start_idx + i));
} else if (attr_type_str == "int64_t") {
ctx.EmplaceBackAttr(CastPyArg2Long(obj, op_type, attr_start_idx + i));
} else if (attr_type_str == "std::string") {
ctx.EmplaceBackAttr(CastPyArg2AttrString(obj, attr_start_idx + i));
} else if (attr_type_str == "std::vector<int>") {
ctx.EmplaceBackAttr(CastPyArg2VectorOfInt(obj, attr_start_idx + i));
} else if (attr_type_str == "std::vector<float>") {
ctx.EmplaceBackAttr(CastPyArg2VectorOfFloat(obj, attr_start_idx + i));
} else if (attr_type_str == "std::vector<int64_t>") {
ctx.EmplaceBackAttr(CastPyArg2Longs(obj, op_type, attr_start_idx + i));
} else if (attr_type_str == "std::vector<std::string>") {
ctx.EmplaceBackAttr(CastPyArg2VectorOfString(obj, attr_start_idx + i));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported `%s` type value as custom attribute now. "
"Supported data types include `bool`, `int`, `float`, "
"`int64_t`, `std::string`, `std::vector<int>`, "
"`std::vector<float>`, `std::vector<int64_t>`, "
"`std::vector<std::string>`, Please check whether "
"the attribute data type and data type string are matched.",
attr_type_str));
}
}
ctx.ConstructInplaceIndex(inputs, outputs, inplace_map);
const auto& inplace_reverse_idx_map = ctx.GetInplaceReverseIndexMap();
for (size_t out_idx = 0; out_idx < outputs.size(); ++out_idx) {
const auto& output = outputs.at(out_idx);
// inplace special case
if (inplace_reverse_idx_map.find(out_idx) !=
inplace_reverse_idx_map.end()) {
size_t in_idx = inplace_reverse_idx_map.at(out_idx);
const auto& input_range = ctx.InputRangeAt(in_idx);
const auto& input_tensor = ctx.InputAt(input_range.first);
// inplace optional [Tensor or vector<Tensor>], un-initialized tensor.
if (paddle::framework::detail::IsOptionalVar(output) &&
!input_tensor.initialized()) {
VLOG(7) << "Custom operator add output " << output
<< " to CustomOpKernelContext. Add un-initialized tensor "
"because the inplace optional input is None";
ctx.EmplaceBackOutput(std::move(paddle::Tensor()));
continue;
}
/// inplace vector<Tensor>, initialized tensor.
if (paddle::framework::detail::IsDuplicableVar(output)) {
std::vector<paddle::Tensor> empty_tensors;
size_t vector_size = input_range.second - input_range.first;
empty_tensors.resize(vector_size);
for (size_t i = 0; i < vector_size; ++i) {
empty_tensors[i] = InitializedEmptyTensor();
}
VLOG(7) << "Custom operator add output " << output
<< " to CustomOpKernelContext. Add vector<tensor> size = "
<< empty_tensors.size();
ctx.EmplaceBackOutputs(std::move(empty_tensors));
continue;
}
}
VLOG(7) << "Custom operator add output " << output
<< " to CustomOpKernelContext. Add initialized Tensor because "
"using general or inplace mechanism";
// general Tensor or inplace Tensor, initialized tensor.
ctx.EmplaceBackOutput(std::move(InitializedEmptyTensor()));
}
// handle inplace map // handle inplace map
ctx.MapPlainOutputs(inputs, outputs, inplace_map); ctx.UpdatePlainOutputs(inputs, outputs, inplace_map);
VLOG(7) << "Run Kernel of Custom Op: " << op_type;
(*paddle::OpMetaInfoHelper::GetKernelFn(vec_map[0]))(&ctx); (*paddle::OpMetaInfoHelper::GetKernelFn(vec_map[0]))(&ctx);
ctx.AssignInplaceOutputs(); ctx.AssignInplaceOutputs();
// handle optional None output when construct backward graph // handle optional None output when construct backward graph
for (size_t i = 0; i < ctx.OutputRange().size(); i++) { for (size_t i = 0; i < ctx.OutputRange().size(); i++) {
if (ctx.OutputRangeAt(i).first + 1 == ctx.OutputRangeAt(i).second) { if (ctx.OutputRangeAt(i).first + 1 == ctx.OutputRangeAt(i).second) {
size_t idx = ctx.OutputRangeAt(i).first; paddle::Tensor* out_tensor =
paddle::Tensor* out_tensor = ctx.MutableOutputAt(idx); ctx.MutableOutputAt(ctx.OutputRangeAt(i).first);
if (!out_tensor->initialized()) { if (!out_tensor->initialized()) {
PADDLE_ENFORCE( PADDLE_ENFORCE(
outputs.at(idx).find(paddle::kOptionalSuffix) != paddle::framework::detail::IsOptionalVar(outputs.at(i)),
std::string::npos,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"Custom operator's %d-th output is not initialized. " "Custom operator's %d-th output is not initialized. "
"Please check your implementation again. If you are " "Please check your implementation again. If you are "
"using inplace optional output, then you must use " "using inplace optional output, then you must use "
"`paddle::Optional` to decorate this output", "`paddle::Optional` to decorate this output",
idx)); i));
// We can also consider using `autograd_meta` to tolerant nullptr. // We can also consider using `autograd_meta` to tolerant nullptr.
out_tensor->set_autograd_meta(std::make_shared<egr::AutogradMeta>()); out_tensor->set_autograd_meta(std::make_shared<egr::AutogradMeta>());
} }
...@@ -603,45 +670,37 @@ static PyObject* eager_api_run_custom_op(PyObject* self, ...@@ -603,45 +670,37 @@ static PyObject* eager_api_run_custom_op(PyObject* self,
} }
VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op"; VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op";
std::vector<std::vector<egr::AutogradMeta*>> ins_auto_grad_metas; size_t slot_ins_num = ctx.InputRange().size();
std::vector<std::vector<egr::AutogradMeta*>> outs_auto_grad_metas; size_t slot_outs_num = ctx.OutputRange().size();
VLOG(7) << "We got slot num of ins is: " << ctx.InputRange().size(); VLOG(7) << "We got slot num of ins is: " << slot_ins_num;
ins_auto_grad_metas.resize(ctx.InputRange().size()); VLOG(7) << "We got slot num of outs is: " << slot_outs_num;
VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size(); std::vector<egr::AutogradMeta*> ins_auto_grad_metas =
outs_auto_grad_metas.resize(ctx.OutputRange().size()); egr::EagerUtils::nullable_autograd_meta(*ctx.AllMutableInput());
std::vector<egr::AutogradMeta*> outs_auto_grad_metas =
for (size_t i = 0; i < ctx.InputRange().size(); i++) { egr::EagerUtils::unsafe_autograd_meta(*ctx.AllMutableOutput());
ins_auto_grad_metas[i] =
egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween(
ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second));
}
for (size_t i = 0; i < ctx.OutputRange().size(); i++) {
outs_auto_grad_metas[i] =
egr::EagerUtils::unsafe_autograd_meta(ctx.OutputsBetweeen(
ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second));
}
bool require_any_grad = false; bool require_any_grad = false;
for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { bool trace_backward = true;
for (size_t i = 0; i < ins_auto_grad_metas.size(); ++i) {
require_any_grad = require_any_grad =
require_any_grad || egr::EagerUtils::ComputeRequireGrad( require_any_grad || egr::EagerUtils::ComputeRequireGrad(
trace_backward, &(ins_auto_grad_metas[i])); trace_backward, ins_auto_grad_metas[i]);
} }
// handle inplace map // handle inplace map
for (size_t i = 0; i < ctx.InputRange().size(); i++) { if (!inplace_map.empty()) {
if (inplace_map.find(inputs[i]) != inplace_map.end()) { for (size_t i = 0; i < ctx.InputRange().size(); i++) {
size_t input_size = if (inplace_map.find(inputs[i]) == inplace_map.end()) {
ctx.InputRangeAt(i).second - ctx.InputRangeAt(i).first; continue;
size_t start_idx = ctx.InputRangeAt(i).first; }
for (size_t j = 0; j < input_size; j++) { const auto& input_pair = ctx.InputRangeAt(i);
egr::EagerUtils::CheckInplace(ctx.InputAt(start_idx + j), for (size_t j = input_pair.first; j < input_pair.second; j++) {
ins_auto_grad_metas[i][j], egr::EagerUtils::CheckInplace(
require_any_grad); ctx.InputAt(j), ins_auto_grad_metas[j], require_any_grad);
if (ctx.MutableInputAt(start_idx + j).defined()) { if (ctx.MutableInputAt(j).defined()) {
// Bump Inplace Version // Bump Inplace Version
ctx.MutableInputAt(start_idx + j).bump_inplace_version(); ctx.MutableInputAt(j).bump_inplace_version();
VLOG(3) << "Custom operator: Tensor(" VLOG(3) << "Custom operator: Tensor(" << ctx.InputAt(j).name()
<< ctx.InputAt(start_idx + j).name()
<< ") uses Inplace Strategy."; << ") uses Inplace Strategy.";
} }
} }
...@@ -651,45 +710,50 @@ static PyObject* eager_api_run_custom_op(PyObject* self, ...@@ -651,45 +710,50 @@ static PyObject* eager_api_run_custom_op(PyObject* self,
if (require_any_grad && (vec_map.size() > 1)) { if (require_any_grad && (vec_map.size() > 1)) {
VLOG(6) << " Construct Grad for Custom Op: " << op_type; VLOG(6) << " Construct Grad for Custom Op: " << op_type;
ConstructFwdAndBwdMap(vec_map, op_type); ConstructFwdAndBwdMap(vec_map, op_type);
for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { for (size_t i = 0; i < outs_auto_grad_metas.size(); ++i) {
egr::EagerUtils::PassStopGradient(false, &(outs_auto_grad_metas[i])); egr::EagerUtils::PassStopGradient(false, outs_auto_grad_metas[i]);
} }
// Note(HongyuJia): In dygraph eager mode, CheckInplace makes sure leaf // Note(HongyuJia): In dygraph eager mode, CheckInplace makes sure leaf
// nodes set stop_gradient=True. However, dygraph mode can also outputs // nodes set stop_gradient=True. However, dygraph mode can also outputs
// lead nodes' gradients (For example, we can get x.grad after x.add_(y)). // lead nodes' gradients (For example, we can get x.grad after x.add_(y)).
// To be consistent with dygraph mode, we have to PassStopGradient for all // To be consistent with dygraph mode, we have to PassStopGradient for all
// inplaced ins_auto_grad_metas. // inplaced ins_auto_grad_metas.
std::unordered_map<size_t, size_t> inplace_tensor_map = const auto& inplace_index_map = ctx.GetInplaceIndexMap();
ctx.GetInplaceTensorMap(); for (auto pair : inplace_index_map) {
for (auto pair : inplace_tensor_map) { const auto& size_pair = ctx.InputRangeAt(pair.first);
egr::EagerUtils::PassStopGradient(false, for (size_t i = size_pair.first; i < size_pair.second; ++i) {
&(ins_auto_grad_metas[pair.first])); egr::EagerUtils::PassStopGradient(false, ins_auto_grad_metas[i]);
}
} }
auto grad_node = std::make_shared<egr::RunCustomOpNode>( auto grad_node = std::make_shared<egr::RunCustomOpNode>(
outs_auto_grad_metas.size(), ins_auto_grad_metas.size(), op_type); slot_outs_num, slot_ins_num, op_type);
auto slot_map = const auto& slot_map =
egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type); egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type);
// Prepare Grad outputs // Prepare Grad outputs
size_t no_grad_cnt = 0; size_t no_grad_cnt = 0;
for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { for (size_t i = 0; i < slot_ins_num; i++) {
const std::vector<paddle::Tensor>& in_tensors = ctx.InputsBetween( const std::vector<paddle::Tensor>& in_tensors = ctx.InputsBetween(
ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second); ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second);
if (slot_map[0][0].find(i) != slot_map[0][0].end()) { if (slot_map[0][0].find(i) != slot_map[0][0].end()) {
grad_node->SetGradOutMeta(in_tensors, slot_map[0][0][i]); grad_node->SetGradOutMeta(in_tensors, slot_map[0][0].at(i));
} else { } else {
grad_node->SetGradOutMeta( grad_node->SetGradOutMeta(in_tensors, slot_ins_num - 1 - no_grad_cnt);
in_tensors, ins_auto_grad_metas.size() - 1 - no_grad_cnt);
no_grad_cnt++; no_grad_cnt++;
} }
} }
// Prepare Grad inputs with grad of fwd outputs // Prepare Grad inputs with grad of fwd outputs
for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { for (size_t i = 0; i < slot_outs_num; i++) {
const std::vector<paddle::Tensor>& out_tensors = ctx.OutputsBetweeen( const auto& size_pair = ctx.OutputRangeAt(i);
ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second); const std::vector<paddle::Tensor>& out_tensors =
ctx.OutputsBetweeen(size_pair.first, size_pair.second);
egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i); for (size_t j = size_pair.first; j < size_pair.second; j++) {
egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node); // SetOutRankWithSlot: slot_id = i, rank = j - size_pair.first
outs_auto_grad_metas[j]->SetSingleOutRankWithSlot(
i, j - size_pair.first);
egr::EagerUtils::SetHistory(outs_auto_grad_metas[j], grad_node);
}
grad_node->SetGradInMeta(out_tensors, i); grad_node->SetGradInMeta(out_tensors, i);
} }
...@@ -713,9 +777,8 @@ static PyObject* eager_api_run_custom_op(PyObject* self, ...@@ -713,9 +777,8 @@ static PyObject* eager_api_run_custom_op(PyObject* self,
ctx.InputRangeAt(it->first).second)); ctx.InputRangeAt(it->first).second));
} }
auto attrs_names = const std::vector<paddle::any>& res_attrs = ctx.Attrs();
paddle::OpMetaInfoHelper::GetAttrs(meta_info_map.at(op_type)[1]); std::vector<paddle::any> attrs(res_attrs.size());
std::vector<paddle::any> attrs(attrs_names.size());
// Prepare attrs for Grad node // Prepare attrs for Grad node
for (auto it = slot_map[0][4].begin(); it != slot_map[0][4].end(); it++) { for (auto it = slot_map[0][4].begin(); it != slot_map[0][4].end(); it++) {
VLOG(7) << "Prepare fwd attrs: " << it->first VLOG(7) << "Prepare fwd attrs: " << it->first
...@@ -725,7 +788,7 @@ static PyObject* eager_api_run_custom_op(PyObject* self, ...@@ -725,7 +788,7 @@ static PyObject* eager_api_run_custom_op(PyObject* self,
grad_node->SetAttrs(attrs); grad_node->SetAttrs(attrs);
} }
} }
RETURN_PY_NONE return ToPyObject(*ctx.AllMutableOutput());
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
......
...@@ -56,7 +56,6 @@ extern PyTypeObject* g_cudapinnedplace_pytype; ...@@ -56,7 +56,6 @@ extern PyTypeObject* g_cudapinnedplace_pytype;
extern PyTypeObject* g_customplace_pytype; extern PyTypeObject* g_customplace_pytype;
extern PyTypeObject* g_framework_tensor_pytype; extern PyTypeObject* g_framework_tensor_pytype;
extern PyTypeObject* g_framework_lodtensorarray_pytype; extern PyTypeObject* g_framework_lodtensorarray_pytype;
extern PyTypeObject* g_custom_op_kernel_ctx_pytype;
extern PyTypeObject* g_jit_function_pytype; extern PyTypeObject* g_jit_function_pytype;
int TensorDtype2NumpyDtype(phi::DataType dtype) { int TensorDtype2NumpyDtype(phi::DataType dtype) {
...@@ -432,6 +431,54 @@ std::vector<size_t> CastPyArg2VectorOfSize_t(PyObject* obj, size_t arg_pos) { ...@@ -432,6 +431,54 @@ std::vector<size_t> CastPyArg2VectorOfSize_t(PyObject* obj, size_t arg_pos) {
return result; return result;
} }
std::vector<float> CastPyArg2VectorOfFloat(PyObject* obj, size_t arg_pos) {
std::vector<float> result;
if (PyList_Check(obj)) {
Py_ssize_t len = PyList_Size(obj);
PyObject* item = nullptr;
for (Py_ssize_t i = 0; i < len; i++) {
item = PyList_GetItem(obj, i);
if (PyObject_CheckFloatOrConvertToFloat(&item)) {
result.emplace_back(static_cast<float>(PyFloat_AsDouble(item)));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"argument (position %d) must be "
"list of float, but got %s at pos %d",
arg_pos + 1,
reinterpret_cast<PyTypeObject*>(item->ob_type)->tp_name,
i));
}
}
} else if (PyTuple_Check(obj)) {
Py_ssize_t len = PyTuple_Size(obj);
PyObject* item = nullptr;
for (Py_ssize_t i = 0; i < len; i++) {
item = PyTuple_GET_ITEM(obj, i);
if (PyObject_CheckFloatOrConvertToFloat(&item)) {
result.emplace_back(static_cast<float>(PyFloat_AsDouble(item)));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"argument (position %d) must be "
"list of float, but got %s at pos %d",
arg_pos + 1,
reinterpret_cast<PyTypeObject*>(item->ob_type)->tp_name,
i));
}
}
} else if (obj == Py_None) {
return {};
} else if (PyObject_CheckFloatOrConvertToFloat(&obj)) {
return {static_cast<float>(PyFloat_AsDouble(obj))};
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"argument (position %d) must be "
"list of float, but got %s",
arg_pos + 1,
reinterpret_cast<PyTypeObject*>(obj->ob_type)->tp_name));
}
return result;
}
std::vector<std::vector<size_t>> CastPyArg2VectorOfVectorOfSize_t( std::vector<std::vector<size_t>> CastPyArg2VectorOfVectorOfSize_t(
PyObject* obj, size_t arg_pos) { PyObject* obj, size_t arg_pos) {
std::vector<std::vector<size_t>> result; std::vector<std::vector<size_t>> result;
...@@ -602,19 +649,6 @@ std::vector<std::string> CastPyArg2VectorOfString(PyObject* obj, ...@@ -602,19 +649,6 @@ std::vector<std::string> CastPyArg2VectorOfString(PyObject* obj,
} }
} }
paddle::CustomOpKernelContext CastPyArg2CustomOpKernelContext(PyObject* obj,
ssize_t arg_pos) {
if (PyObject_IsInstance(
obj, reinterpret_cast<PyObject*>(g_custom_op_kernel_ctx_pytype))) {
return ::pybind11::handle(obj).cast<paddle::CustomOpKernelContext>();
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"argument (position %d) must be CustomOpKernelContext, "
"but got %s",
arg_pos + 1,
reinterpret_cast<PyTypeObject*>(obj->ob_type)->tp_name));
}
}
PyObject* ToPyObject(bool value) { PyObject* ToPyObject(bool value) {
if (value) { if (value) {
Py_INCREF(Py_True); Py_INCREF(Py_True);
......
...@@ -57,8 +57,6 @@ int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos); ...@@ -57,8 +57,6 @@ int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos);
size_t CastPyArg2AttrSize_t(PyObject* obj, ssize_t arg_pos); size_t CastPyArg2AttrSize_t(PyObject* obj, ssize_t arg_pos);
float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos); float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos);
std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos); std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos);
paddle::CustomOpKernelContext CastPyArg2CustomOpKernelContext(PyObject* obj,
ssize_t arg_pos);
std::shared_ptr<imperative::VarBase> CastPyArg2VarBase(PyObject* obj, std::shared_ptr<imperative::VarBase> CastPyArg2VarBase(PyObject* obj,
ssize_t arg_pos); ssize_t arg_pos);
std::vector<paddle::Tensor> CastPyArg2VectorOfTensor(PyObject* obj, std::vector<paddle::Tensor> CastPyArg2VectorOfTensor(PyObject* obj,
...@@ -70,6 +68,7 @@ std::vector<phi::DenseTensor> CastPyArg2VectorOfTensorBase(PyObject* obj, ...@@ -70,6 +68,7 @@ std::vector<phi::DenseTensor> CastPyArg2VectorOfTensorBase(PyObject* obj,
std::vector<int> CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos); std::vector<int> CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos);
std::vector<int64_t> CastPyArg2VectorOfInt64(PyObject* obj, size_t arg_pos); std::vector<int64_t> CastPyArg2VectorOfInt64(PyObject* obj, size_t arg_pos);
std::vector<size_t> CastPyArg2VectorOfSize_t(PyObject* obj, size_t arg_pos); std::vector<size_t> CastPyArg2VectorOfSize_t(PyObject* obj, size_t arg_pos);
std::vector<float> CastPyArg2VectorOfFloat(PyObject* obj, size_t arg_pos);
std::vector<std::vector<size_t>> CastPyArg2VectorOfVectorOfSize_t( std::vector<std::vector<size_t>> CastPyArg2VectorOfVectorOfSize_t(
PyObject* obj, size_t arg_pos); PyObject* obj, size_t arg_pos);
framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj,
......
...@@ -464,7 +464,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj, ...@@ -464,7 +464,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj,
for (Py_ssize_t i = 0; i < len; i++) { for (Py_ssize_t i = 0; i < len; i++) {
item = PyList_GetItem(obj, i); item = PyList_GetItem(obj, i);
if (PyObject_CheckLongOrToLong(&item)) { if (PyObject_CheckLongOrToLong(&item)) {
value.emplace_back(PyLong_AsLong(item)); value.emplace_back((int64_t)PyLong_AsLongLong(item));
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"%s(): argument (position %d) must be " "%s(): argument (position %d) must be "
...@@ -481,7 +481,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj, ...@@ -481,7 +481,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj,
for (Py_ssize_t i = 0; i < len; i++) { for (Py_ssize_t i = 0; i < len; i++) {
item = PyTuple_GetItem(obj, i); item = PyTuple_GetItem(obj, i);
if (PyObject_CheckLongOrToLong(&item)) { if (PyObject_CheckLongOrToLong(&item)) {
value.emplace_back(PyLong_AsLong(item)); value.emplace_back((int64_t)PyLong_AsLongLong(item));
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"%s(): argument (position %d) must be " "%s(): argument (position %d) must be "
...@@ -498,7 +498,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj, ...@@ -498,7 +498,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj,
for (Py_ssize_t i = 0; i < len; i++) { for (Py_ssize_t i = 0; i < len; i++) {
item = PySequence_GetItem(obj, i); item = PySequence_GetItem(obj, i);
if (PyObject_CheckLongOrToLong(&item)) { if (PyObject_CheckLongOrToLong(&item)) {
value.emplace_back(PyLong_AsLong(item)); value.emplace_back((int64_t)PyLong_AsLongLong(item));
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"%s(): argument (position %d) must be " "%s(): argument (position %d) must be "
...@@ -512,7 +512,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj, ...@@ -512,7 +512,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj,
} else if (obj == Py_None) { } else if (obj == Py_None) {
return {}; return {};
} else if (PyObject_CheckLongOrToLong(&obj)) { } else if (PyObject_CheckLongOrToLong(&obj)) {
return {static_cast<int64_t>(PyLong_AsLong(obj))}; return {(int64_t)PyLong_AsLongLong(obj)};
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"%s(): argument (position %d) must be " "%s(): argument (position %d) must be "
......
...@@ -1013,70 +1013,6 @@ PYBIND11_MODULE(libpaddle, m) { ...@@ -1013,70 +1013,6 @@ PYBIND11_MODULE(libpaddle, m) {
m.def("_promote_types_if_complex_exists", m.def("_promote_types_if_complex_exists",
&paddle::framework::PromoteTypesIfComplexExists); &paddle::framework::PromoteTypesIfComplexExists);
py::class_<paddle::CustomOpKernelContext> custom_op_kernel_ctx(
m, "CustomOpKernelContext", R"DOC()DOC");
g_custom_op_kernel_ctx_pytype =
reinterpret_cast<PyTypeObject *>(custom_op_kernel_ctx.ptr());
custom_op_kernel_ctx.def(py::init<>())
.def("add_inputs",
[](paddle::CustomOpKernelContext &self, const py::handle &input) {
PyObject *obj = input.ptr();
if (PyList_Check(obj) || PyTuple_Check(obj)) {
self.EmplaceBackInputs(
std::move(CastPyArg2VectorOfTensor(obj, 1)));
} else if (obj == Py_None) {
// Check optional Tensor, use one un-initialized tensor to
// indicate both Tensor and vector<Tensor> inputs
self.EmplaceBackInput(std::move(paddle::Tensor()));
} else {
self.EmplaceBackInput(std::move(CastPyArg2Tensor(obj, 1)));
}
})
.def("add_outputs",
[](paddle::CustomOpKernelContext &self, py::handle &outputs) {
PyObject *obj = outputs.ptr();
if (PyList_Check(obj) || PyTuple_Check(obj)) {
self.EmplaceBackOutputs(
std::move(CastPyArg2VectorOfTensor(obj, 1)));
} else {
self.EmplaceBackOutput(std::move(CastPyArg2Tensor(obj, 1)));
}
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, bool attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, int attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, float attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, int64_t attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, const std::string &attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self,
const std::vector<int> &attr) { self.EmplaceBackAttr(attr); })
.def("add_attr",
[](paddle::CustomOpKernelContext &self,
const std::vector<float> &attr) { self.EmplaceBackAttr(attr); })
.def("add_attr",
[](paddle::CustomOpKernelContext &self,
const std::vector<int64_t> &attr) { self.EmplaceBackAttr(attr); })
.def("add_attr",
[](paddle::CustomOpKernelContext &self,
const std::vector<std::string> &attr) {
self.EmplaceBackAttr(attr);
});
py::class_<Variable>(m, "Variable", R"DOC(Variable Class. py::class_<Variable>(m, "Variable", R"DOC(Variable Class.
All parameter, weight, gradient are variables in Paddle. All parameter, weight, gradient are variables in Paddle.
......
...@@ -119,6 +119,7 @@ class PADDLE_API CustomOpKernelContext { ...@@ -119,6 +119,7 @@ class PADDLE_API CustomOpKernelContext {
const Tensor& InputAt(size_t idx) const; const Tensor& InputAt(size_t idx) const;
std::vector<Tensor> InputsBetween(size_t start, size_t end) const; std::vector<Tensor> InputsBetween(size_t start, size_t end) const;
Tensor& MutableInputAt(size_t idx); Tensor& MutableInputAt(size_t idx);
std::vector<Tensor>* AllMutableInput();
paddle::optional<Tensor> OptionalInputAt(size_t idx); paddle::optional<Tensor> OptionalInputAt(size_t idx);
paddle::optional<std::vector<Tensor>> OptionalInputsBetween(size_t start, paddle::optional<std::vector<Tensor>> OptionalInputsBetween(size_t start,
size_t end); size_t end);
...@@ -144,13 +145,18 @@ class PADDLE_API CustomOpKernelContext { ...@@ -144,13 +145,18 @@ class PADDLE_API CustomOpKernelContext {
} }
// handle inplace map // handle inplace map
void MapPlainOutputs( void ConstructInplaceIndex(
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::unordered_map<std::string, std::string>& inplace_map);
void UpdatePlainOutputs(
const std::vector<std::string>& inputs, const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, const std::vector<std::string>& outputs,
const std::unordered_map<std::string, std::string>& inplace_map); const std::unordered_map<std::string, std::string>& inplace_map);
void AssignInplaceOutputs(); void AssignInplaceOutputs();
std::vector<Tensor*>* AllMutablePlainOutput(); std::vector<Tensor*>* AllMutablePlainOutput();
std::unordered_map<size_t, size_t> GetInplaceTensorMap(); std::unordered_map<size_t, size_t> GetInplaceIndexMap();
std::unordered_map<size_t, size_t> GetInplaceReverseIndexMap();
private: private:
// TODO(chenweihang): replaced be SmallVector // TODO(chenweihang): replaced be SmallVector
...@@ -159,7 +165,10 @@ class PADDLE_API CustomOpKernelContext { ...@@ -159,7 +165,10 @@ class PADDLE_API CustomOpKernelContext {
std::vector<paddle::any> attrs_; std::vector<paddle::any> attrs_;
// handle inplace map // handle inplace map
std::vector<Tensor*> plain_outputs_; std::vector<Tensor*> plain_outputs_;
std::unordered_map<size_t, size_t> inplace_tensor_map_; // {input: output}
std::unordered_map<size_t, size_t> inplace_idx_map_;
// {output: input}
std::unordered_map<size_t, size_t> inplace_reverse_idx_map_;
std::vector<std::pair<size_t, size_t>> input_range_; std::vector<std::pair<size_t, size_t>> input_range_;
std::vector<std::pair<size_t, size_t>> output_range_; std::vector<std::pair<size_t, size_t>> output_range_;
......
...@@ -103,6 +103,10 @@ Tensor& CustomOpKernelContext::MutableInputAt(size_t idx) { ...@@ -103,6 +103,10 @@ Tensor& CustomOpKernelContext::MutableInputAt(size_t idx) {
return inputs_.at(idx); return inputs_.at(idx);
} }
std::vector<Tensor>* CustomOpKernelContext::AllMutableInput() {
return &inputs_;
}
paddle::optional<Tensor> CustomOpKernelContext::OptionalInputAt(size_t idx) { paddle::optional<Tensor> CustomOpKernelContext::OptionalInputAt(size_t idx) {
if (!inputs_.at(idx).is_initialized()) { if (!inputs_.at(idx).is_initialized()) {
return paddle::none; return paddle::none;
...@@ -156,13 +160,15 @@ const std::pair<size_t, size_t>& CustomOpKernelContext::OutputRangeAt( ...@@ -156,13 +160,15 @@ const std::pair<size_t, size_t>& CustomOpKernelContext::OutputRangeAt(
return output_range_.at(idx); return output_range_.at(idx);
} }
// handle inplace mechanism void CustomOpKernelContext::ConstructInplaceIndex(
// Find out non-inplace output tensors.
// TODO(HongyuJia): Add cache for inplace_tensor_map_ to optimize performance
void CustomOpKernelContext::MapPlainOutputs(
const std::vector<std::string>& inputs, const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, const std::vector<std::string>& outputs,
const std::unordered_map<std::string, std::string>& inplace_map) { const std::unordered_map<std::string, std::string>& inplace_map) {
// Cache inplace indices.
if (inplace_map.empty() || !inplace_idx_map_.empty()) {
VLOG(4) << "Custom opertor ConstructInplaceIndex no need to recompute.";
return;
}
for (size_t in_idx = 0; in_idx < inputs.size(); ++in_idx) { for (size_t in_idx = 0; in_idx < inputs.size(); ++in_idx) {
auto& input = inputs[in_idx]; auto& input = inputs[in_idx];
if (inplace_map.find(input) == inplace_map.end()) { if (inplace_map.find(input) == inplace_map.end()) {
...@@ -175,15 +181,26 @@ void CustomOpKernelContext::MapPlainOutputs( ...@@ -175,15 +181,26 @@ void CustomOpKernelContext::MapPlainOutputs(
"the input of `Inplace` again and make " "the input of `Inplace` again and make "
"sure you registered your op accurately. ", "sure you registered your op accurately. ",
input)); input));
inplace_tensor_map_[in_idx] = distance(outputs.begin(), out_iter); size_t out_idx = distance(outputs.begin(), out_iter);
inplace_idx_map_[in_idx] = out_idx;
inplace_reverse_idx_map_[out_idx] = in_idx;
}
VLOG(4) << "Custom opertor update inplace input-output map successfully.";
}
// Find out non-inplace output tensors.
void CustomOpKernelContext::UpdatePlainOutputs(
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::unordered_map<std::string, std::string>& inplace_map) {
// Cache plain outputs vector.
if (!plain_outputs_.empty()) {
VLOG(4) << "Custom opertor UpdatePlainOutputs no need to recompute.";
return;
} }
ConstructInplaceIndex(inputs, outputs, inplace_map);
for (size_t i = 0; i < outputs.size(); ++i) { for (size_t i = 0; i < outputs.size(); ++i) {
if (std::any_of( if (inplace_reverse_idx_map_.find(i) != inplace_reverse_idx_map_.end()) {
inplace_tensor_map_.begin(),
inplace_tensor_map_.end(),
[i](std::unordered_map<size_t, size_t>::const_reference pair) {
return pair.second == i;
})) {
continue; continue;
} }
size_t output_start_idx = output_range_[i].first; size_t output_start_idx = output_range_[i].first;
...@@ -192,11 +209,12 @@ void CustomOpKernelContext::MapPlainOutputs( ...@@ -192,11 +209,12 @@ void CustomOpKernelContext::MapPlainOutputs(
plain_outputs_.push_back(&outputs_[idx]); plain_outputs_.push_back(&outputs_[idx]);
} }
} }
VLOG(4) << "Custom opertor update inplace input-output map successfully."; VLOG(4) << "Custom opertor update plain outputs map successfully.";
} }
// Assign input tensor to inplace output tensors. // Assign input tensor to inplace output tensors.
void CustomOpKernelContext::AssignInplaceOutputs() { void CustomOpKernelContext::AssignInplaceOutputs() {
for (auto pair : inplace_tensor_map_) { for (auto pair : inplace_idx_map_) {
size_t in_start_idx = input_range_[pair.first].first; size_t in_start_idx = input_range_[pair.first].first;
size_t in_end_idx = input_range_[pair.first].second; size_t in_end_idx = input_range_[pair.first].second;
size_t out_start_idx = output_range_[pair.second].first; size_t out_start_idx = output_range_[pair.second].first;
...@@ -213,15 +231,21 @@ void CustomOpKernelContext::AssignInplaceOutputs() { ...@@ -213,15 +231,21 @@ void CustomOpKernelContext::AssignInplaceOutputs() {
} }
VLOG(4) << "Custom opertor update inplace input-output tensor " VLOG(4) << "Custom opertor update inplace input-output tensor "
"successfully. Update map size = " "successfully. Update map size = "
<< inplace_tensor_map_.size(); << inplace_idx_map_.size();
} }
} }
std::vector<Tensor*>* CustomOpKernelContext::AllMutablePlainOutput() { std::vector<Tensor*>* CustomOpKernelContext::AllMutablePlainOutput() {
return &plain_outputs_; return &plain_outputs_;
} }
std::unordered_map<size_t, size_t> CustomOpKernelContext::GetInplaceIndexMap() {
return inplace_idx_map_;
}
std::unordered_map<size_t, size_t> std::unordered_map<size_t, size_t>
CustomOpKernelContext::GetInplaceTensorMap() { CustomOpKernelContext::GetInplaceReverseIndexMap() {
return inplace_tensor_map_; return inplace_reverse_idx_map_;
} }
////////////////////// Op Meta Info ////////////////////// ////////////////////// Op Meta Info //////////////////////
......
...@@ -1042,7 +1042,9 @@ def _gen_output_content( ...@@ -1042,7 +1042,9 @@ def _gen_output_content(
# ' ' * tab space * tab number # ' ' * tab space * tab number
indent = ' ' * 4 * 2 indent = ' ' * 4 * 2
inplace_idx = {v: k for k, v in inplace_reverse_idx.items()} inplace_idx = {v: k for k, v in inplace_reverse_idx.items()}
dynamic_content = "" dynamic_content = f"""
{indent}res = []
{indent}start_idx = 0"""
static_content = f""" static_content = f"""
{indent}ins = {{}} {indent}ins = {{}}
{indent}ins_map = {ins_map} {indent}ins_map = {ins_map}
...@@ -1065,10 +1067,11 @@ def _gen_output_content( ...@@ -1065,10 +1067,11 @@ def _gen_output_content(
lower_in_names = in_names[in_idx].split("@")[0].lower() lower_in_names = in_names[in_idx].split("@")[0].lower()
dynamic_content += f""" dynamic_content += f"""
{indent}if {lower_in_names} is not None: {indent}if {lower_in_names} is not None:
{indent} outs['{out_name}'] = [core.eager.Tensor() for _ in range(len({lower_in_names}))] {indent} res.append(outs[start_idx: start_idx + len({lower_in_names})])
{indent} start_idx += len({lower_in_names})
{indent}else: {indent}else:
{indent} outs['{out_name}'] = core.eager.Tensor() {indent} res.append(None)
{indent}ctx.add_outputs(outs['{out_name}'])""" {indent} start_idx += 1"""
static_content += f""" static_content += f"""
{indent}if {lower_in_names} is not None: {indent}if {lower_in_names} is not None:
{indent} outs['{out_name}'] = [helper.create_variable(dtype='float32') for _ in range(len({lower_in_names}))]""" {indent} outs['{out_name}'] = [helper.create_variable(dtype='float32') for _ in range(len({lower_in_names}))]"""
...@@ -1077,8 +1080,8 @@ def _gen_output_content( ...@@ -1077,8 +1080,8 @@ def _gen_output_content(
): # inplace vector<Tensor> output case ): # inplace vector<Tensor> output case
lower_in_names = in_names[in_idx].split("@")[0].lower() lower_in_names = in_names[in_idx].split("@")[0].lower()
dynamic_content += f""" dynamic_content += f"""
{indent}outs['{out_name}'] = [core.eager.Tensor() for _ in range(len({lower_in_names}))] {indent}res.append(outs[start_idx: start_idx + len({lower_in_names})])
{indent}ctx.add_outputs(outs['{out_name}'])""" {indent}start_idx += len({lower_in_names})"""
static_content += f""" static_content += f"""
{indent}outs['{out_name}'] = [helper.create_variable(dtype='float32') for _ in range(len({lower_in_names}))]""" {indent}outs['{out_name}'] = [helper.create_variable(dtype='float32') for _ in range(len({lower_in_names}))]"""
elif ( elif (
...@@ -1086,21 +1089,22 @@ def _gen_output_content( ...@@ -1086,21 +1089,22 @@ def _gen_output_content(
): # inplace optional Tensor output case, handle inplace None input ): # inplace optional Tensor output case, handle inplace None input
lower_in_names = in_names[in_idx].split("@")[0].lower() lower_in_names = in_names[in_idx].split("@")[0].lower()
dynamic_content += f""" dynamic_content += f"""
{indent}outs['{out_name}'] = core.eager.Tensor() {indent}if {lower_in_names} is not None:
{indent}ctx.add_outputs(outs['{out_name}'])""" {indent} res.append(outs[start_idx])
{indent}else:
{indent} res.append(None)
{indent}start_idx += 1"""
static_content += f""" static_content += f"""
{indent}if {lower_in_names} is not None: {indent}if {lower_in_names} is not None:
{indent} outs['{out_name}'] = helper.create_variable(dtype='float32')""" {indent} outs['{out_name}'] = helper.create_variable(dtype='float32')"""
else: # general/inplace Tensor output case else: # general/inplace Tensor output case
dynamic_content += f""" dynamic_content += f"""
{indent}outs['{out_name}'] = core.eager.Tensor() {indent}res.append(outs[start_idx])
{indent}ctx.add_outputs(outs['{out_name}'])""" {indent}start_idx += 1"""
static_content += f""" static_content += f"""
{indent}outs['{out_name}'] = helper.create_variable(dtype='float32')""" {indent}outs['{out_name}'] = helper.create_variable(dtype='float32')"""
dynamic_content += f""" dynamic_content += f"""
{indent}core.eager._run_custom_op(ctx, "{op_name}", True)
{indent}res = [outs[out_name] if isinstance(outs[out_name], list) or outs[out_name]._is_initialized() else None for out_name in outs_list]
{indent}return res[0] if len(res)==1 else res""" {indent}return res[0] if len(res)==1 else res"""
static_content += f""" static_content += f"""
...@@ -1134,7 +1138,7 @@ def _custom_api_content(op_name): ...@@ -1134,7 +1138,7 @@ def _custom_api_content(op_name):
API_TEMPLATE = textwrap.dedent( API_TEMPLATE = textwrap.dedent(
""" """
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.core import Tensor, CustomOpKernelContext from paddle.fluid.core import Tensor
from paddle.fluid.framework import _dygraph_tracer, in_dygraph_mode from paddle.fluid.framework import _dygraph_tracer, in_dygraph_mode
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
...@@ -1146,11 +1150,7 @@ def _custom_api_content(op_name): ...@@ -1146,11 +1150,7 @@ def _custom_api_content(op_name):
# The output variable's dtype use default value 'float32', # The output variable's dtype use default value 'float32',
# and the actual dtype of output variable will be inferred in runtime. # and the actual dtype of output variable will be inferred in runtime.
if in_dygraph_mode(): if in_dygraph_mode():
ctx = CustomOpKernelContext() outs = core.eager._run_custom_op("{op_name}", {params_list})
for i in {in_names}:
ctx.add_inputs(i)
for j in {attr_names}:
ctx.add_attr(j)
{dynamic_content} {dynamic_content}
else: else:
{static_content} {static_content}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册