diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 2fc2deb087e89a52b3e451e30e80a0cd7cc671e0..e0a80d3c79854301fe55e63fc4655fe76cdd9caf 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1131,7 +1131,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // phase
   if (FLAGS_run_pten_kernel &&
       pten::KernelFactory::Instance().HasCompatiblePtenKernel(type_)) {
-    if (pt_kernel_signature_.get() == nullptr || pt_kernel_.get() == nullptr) {
+    if (pt_kernel_signature_ == nullptr || pt_kernel_ == nullptr) {
       ChoosePtenKernel(exe_ctx);
     }
     run_pten_kernel_ = pt_kernel_->IsValid();
@@ -1178,8 +1178,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     platform::RecordEvent record_event("compute",
                                        platform::EventRole::kInnerOp);
     if (run_pten_kernel_) {
-      auto op_kernel_ctx = BuildPtenKernelContext(*runtime_ctx, *dev_ctx);
-      (*pt_kernel_)(&op_kernel_ctx);
+      if (pt_kernel_context_ == nullptr) {
+        pt_kernel_context_.reset(new pten::KernelContext());
+      }
+      BuildPtenKernelContext(*runtime_ctx, dev_ctx);
+      (*pt_kernel_)(pt_kernel_context_.get());
+      pt_kernel_context_->ClearData();
     } else {
       (*kernel_func_)(
           ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx));
@@ -1765,8 +1769,8 @@ KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs(
   return KernelSignatureMap::Instance().Get(Type());
 }
 
-pten::KernelContext OperatorWithKernel::BuildPtenKernelContext(
-    const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const {
+void OperatorWithKernel::BuildPtenKernelContext(
+    const RuntimeContext& ctx, platform::DeviceContext* dev_ctx) const {
   // TODO(chenweihang): now only work for very simple case,
   // many cases need to be deal with later:
   // 1. the input and output are not tensor
@@ -1774,7 +1778,7 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext(
   // 3. needless attributes remove
   // 4. use pt Tensor directly
   // 5. kernel input is not DenseTensor
-  pten::KernelContext op_kernel_ctx(dev_ctx);
+  pt_kernel_context_->SetDeviceContext(dev_ctx);
 
   auto& input_names = std::get<0>(pt_kernel_signature_->args);
   auto& attr_names = std::get<1>(pt_kernel_signature_->args);
@@ -1803,30 +1807,53 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext(
                         attr_names.size(), attr_defs.size()));
 
   for (size_t i = 0; i < input_names.size(); ++i) {
-    auto in_def = input_defs.at(i);
-    VLOG(2) << "in_def: " << in_def.backend << ", " << in_def.dtype << ", "
-            << in_def.layout;
-
-    auto ins_vector = ctx.inputs.at(input_names[i]);
-
-    paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
-    for (auto var : ins_vector) {
-      tmp_inputs.emplace_back(
-          experimental::MakePtenTensorBaseFromVar(*var, in_def));
+    auto& in_def = input_defs.at(i);
+    auto& ins_vector = ctx.inputs.at(input_names[i]);
+    if (pt_kernel_context_->InputsSize() <= i) {
+      paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
+      for (auto* var : ins_vector) {
+        tmp_inputs.emplace_back(
+            experimental::MakePtenTensorBaseFromVar(*var, in_def));
+      }
+      pt_kernel_context_->EmplaceBackInputs(std::move(tmp_inputs));
+    } else {
+      size_t input_size = pt_kernel_context_->InputsSize();
+      for (size_t j = 0; j < ins_vector.size(); ++j) {
+        if (input_size > i + j) {
+          experimental::ReMakePtenDenseTensorFromVar(
+              *ins_vector[j], in_def,
+              pt_kernel_context_->MutableInputAt<pten::DenseTensor>(i + j));
+        }
+        // TODO(chenweihang): adapt multi-input case later
+      }
+      pt_kernel_context_->MutableInputRangeAt(i) =
+          std::make_pair(i, i + ins_vector.size());
     }
-    op_kernel_ctx.EmplaceBackInputs(std::move(tmp_inputs));
   }
 
   for (size_t i = 0; i < output_names.size(); ++i) {
-    auto out_def = output_defs.at(i);
-    auto outs_vector = ctx.outputs.at(output_names[i]);
-
-    paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
-    for (auto var : outs_vector) {
-      tmp_outputs.emplace_back(
-          experimental::MakePtenTensorBaseFromVar(var, out_def));
+    auto& out_def = output_defs.at(i);
+    auto& outs_vector = ctx.outputs.at(output_names[i]);
+    if (pt_kernel_context_->OutputsSize() <= i) {
+      paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
+      for (auto* var : outs_vector) {
+        tmp_outputs.emplace_back(
+            experimental::MakePtenTensorBaseFromVar(var, out_def));
+      }
+      pt_kernel_context_->EmplaceBackOutputs(std::move(tmp_outputs));
+    } else {
+      size_t output_size = pt_kernel_context_->OutputsSize();
+      for (size_t j = 0; j < outs_vector.size(); ++j) {
+        if (output_size > i + j) {
+          experimental::ReMakePtenDenseTensorFromVar(
+              outs_vector[j], out_def,
+              pt_kernel_context_->MutableOutputAt<pten::DenseTensor>(i + j));
+        }
+        // TODO(chenweihang): adapt multi-output case later
+      }
+      pt_kernel_context_->MutableOutputRangeAt(i) =
+          std::make_pair(i, i + outs_vector.size());
     }
-    op_kernel_ctx.EmplaceBackOutputs(std::move(tmp_outputs));
   }
 
   for (size_t i = 0; i < attr_names.size(); ++i) {
@@ -1836,11 +1863,11 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext(
       // TODO(zhangyunfei): Scalar should hold scaler type, and we should check
       // attribtue type by attr_defs
       if (std::type_index(attr.type()) == std::type_index(typeid(float))) {
-        op_kernel_ctx.EmplaceBackAttr(
+        pt_kernel_context_->EmplaceBackAttr(
             std::move(pten::Scalar(BOOST_GET_CONST(float, attr))));
       } else if (std::type_index(attr.type()) ==
                  std::type_index(typeid(std::string))) {
-        op_kernel_ctx.EmplaceBackAttr(
+        pt_kernel_context_->EmplaceBackAttr(
             std::move(pten::Scalar(BOOST_GET_CONST(std::string, attr))));
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
@@ -1851,11 +1878,11 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext(
     } else {
       // TODO(chenweihang): support other attrs later
       if (attr_defs[i].type_index == std::type_index(typeid(int))) {
-        op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(int, attr));
+        pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(int, attr));
       } else if (attr_defs[i].type_index == std::type_index(typeid(float))) {
-        op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(float, attr));
+        pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(float, attr));
       } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) {
-        op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(bool, attr));
+        pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(bool, attr));
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "unsupported cast op attribute `%s` when construct "
@@ -1864,8 +1891,6 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext(
       }
     }
   }
-
-  return op_kernel_ctx;
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index b75dcade6fccf703b36e705b6171b65d93f0d723..4c071b777fe8359cba4276dc53ca690df6d1c1de 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -586,8 +586,8 @@ class OperatorWithKernel : public OperatorBase {
   /* member functions for adapting to pten lib */
   void ChoosePtenKernel(const ExecutionContext& ctx) const;
 
-  pten::KernelContext BuildPtenKernelContext(
-      const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const;
+  void BuildPtenKernelContext(const RuntimeContext& ctx,
+                              platform::DeviceContext* dev_ctx) const;
 
  protected:
   mutable std::unique_ptr<OpKernelType> kernel_type_;
@@ -605,6 +605,9 @@ class OperatorWithKernel : public OperatorBase {
   mutable bool run_pten_kernel_ = false;
   mutable std::unique_ptr<KernelSignature> pt_kernel_signature_;
   mutable std::unique_ptr<pten::Kernel> pt_kernel_;
+  // In order to reduce the compatibility phase
+  // performance overhead, temporarily cache KernelContext
+  mutable std::unique_ptr<pten::KernelContext> pt_kernel_context_;
 };
 
 extern bool OpSupportGPU(const std::string& op_type);
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index c45f92496b3e827fc6f6f342f72da90afee6930e..8f196636af4894deee2044586fb7903e2780ba5a 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -1,14 +1,13 @@
 cc_library(imperative_flag SRCS flags.cc DEPS gflags flags)
-
 IF(WITH_XPU)
-cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils pten_utils)
+cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils pten pten_utils)
 ELSE()
-cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils pten_utils)
+cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils pten pten_utils)
 ENDIF()
 cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
 add_subdirectory(jit)
 cc_library(amp SRCS amp_auto_cast.cc DEPS layer )
-cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal)
+cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector)
 cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator)
 cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator)
 cc_library(imperative_profiler SRCS profiler.cc DEPS flags)
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 53ae5b8127fdba5dd68ddc6748dc35e9fe7ae8ec..b584b928f96b9733bdeb36821092944323520f4b 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -356,6 +356,8 @@ void VarBase::BumpInplaceVersion() {
   MutableVar()->BumpInplaceVersion();
 }
 
+pten::KernelContext OpBase::pt_kernel_context_;
+
 void OpBase::SetType(const std::string& type) {
   op_ = framework::OpRegistry::CreateOp(type, {}, {}, {}, false);
 }
@@ -371,7 +373,8 @@ static void OpBaseRunImpl(const framework::OperatorBase& op,
                           const NameVarMap<VarType>& outs,
                           const framework::AttributeMap& attrs,
                           const framework::AttributeMap& default_attrs,
-                          const platform::Place& place) {
+                          const platform::Place& place,
+                          pten::KernelContext* pt_kernel_context) {
   auto* op_kernel = dynamic_cast<const framework::OperatorWithKernel*>(&op);
   PADDLE_ENFORCE_NOT_NULL(
       op_kernel, platform::errors::PermissionDenied(
@@ -412,8 +415,8 @@ static void OpBaseRunImpl(const framework::OperatorBase& op,
    * after the execution of op, but the original input is directly
    * overwritten in the previous dynamic graph implemention.
    */
-  auto prepared_op =
-      PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs, default_attrs);
+  auto prepared_op = PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs,
+                                         default_attrs, pt_kernel_context);
   auto tmp_ins_ptr =
       PrepareData<VarType>(*op_kernel, ins, prepared_op.kernel_type());
   if (tmp_ins_ptr == nullptr) {
@@ -441,7 +444,8 @@ void OpBase::Run(const framework::OperatorBase& op,
                  const framework::AttributeMap& attrs,
                  const framework::AttributeMap& default_attrs,
                  const platform::Place& place) {
-  OpBaseRunImpl<VarBase>(op, ins, outs, attrs, default_attrs, place);
+  OpBaseRunImpl<VarBase>(op, ins, outs, attrs, default_attrs, place,
+                         &pt_kernel_context_);
 }
 
 void OpBase::Run(const framework::OperatorBase& op,
@@ -450,7 +454,8 @@ void OpBase::Run(const framework::OperatorBase& op,
                  const framework::AttributeMap& attrs,
                  const framework::AttributeMap& default_attrs,
                  const platform::Place& place) {
-  OpBaseRunImpl<VariableWrapper>(op, ins, outs, attrs, default_attrs, place);
+  OpBaseRunImpl<VariableWrapper>(op, ins, outs, attrs, default_attrs, place,
+                                 &pt_kernel_context_);
 }
 
 void ClearNoNeedBufferInputs(OpBase* op) {
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 16580627ed1964c6cfc81a48b15f26d0b2459a78..9108155a043b7a56ca7db608a601cfa6c3b8d714 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -36,6 +36,7 @@
 #include "paddle/fluid/imperative/variable_wrapper.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
+#include "paddle/pten/include/core.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h
index acb125a82925d7971b7b03ee90198f87c1a5b9c0..4122e2af3dedaee0b0dfd74923870b7137fe73a3 100644
--- a/paddle/fluid/imperative/op_base.h
+++ b/paddle/fluid/imperative/op_base.h
@@ -25,6 +25,7 @@
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/imperative/variable_wrapper.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/pten/include/core.h"
 
 namespace paddle {
 namespace imperative {
@@ -183,6 +184,8 @@ class OpBase {
                   const framework::AttributeMap& default_attrs,
                   const platform::Place& place);
 
+  static pten::KernelContext* GetKernelContext() { return &pt_kernel_context_; }
+
  private:
   static const std::string& UnknownOpType() {
     static std::string kUnknownOpType{"unknown"};
@@ -197,6 +200,9 @@ class OpBase {
   std::unique_ptr<framework::OperatorBase> op_;
   platform::Place place_;
   size_t id_{-1UL};
+  // In order to reduce the compatibility phase
+  // performance overhead, temporarily cache KernelContext
+  static pten::KernelContext pt_kernel_context_;
 };
 
 class GradOpNode {
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 7c0aaed25ab14fdfd977355fcde49877a54e1e86..c9e211809a4064fa25da8b5825bd792f7318ec96 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -17,6 +17,7 @@
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/imperative/infer_shape_context.h"
+#include "paddle/fluid/imperative/tracer.h"
 #include "paddle/pten/common/scalar.h"
 #include "paddle/utils/small_vector.h"
 #ifdef PADDLE_WITH_XPU
@@ -112,6 +113,7 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
                        const framework::OpKernelType& kernel_type,
                        const framework::KernelSignature& kernel_signature,
                        const pten::Kernel& pt_kernel,
+                       pten::KernelContext* pt_kernel_context,
                        platform::DeviceContext* dev_ctx)
     : op_(op),
       ctx_(ctx),
@@ -120,7 +122,8 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
       dev_ctx_(dev_ctx),
       run_pten_kernel_(true),
       pt_kernel_signature_(kernel_signature),
-      pt_kernel_(pt_kernel) {}
+      pt_kernel_(pt_kernel),
+      pt_kernel_context_(pt_kernel_context) {}
 
 template <typename VarType>
 PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
@@ -128,7 +131,8 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
                        const framework::OperatorWithKernel& op,
                        const platform::Place& place,
                        const framework::AttributeMap& attrs,
-                       const framework::AttributeMap& default_attrs) {
+                       const framework::AttributeMap& default_attrs,
+                       pten::KernelContext* pt_kernel_context) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
 
@@ -171,7 +175,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
 
       // TODO(chenweihang): using CPUKernel when miss device kernel case
       return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature,
-                        pt_kernel, dev_ctx);
+                        pt_kernel, pt_kernel_context, dev_ctx);
     } else {
       VLOG(1) << "Dynamic mode ChoosePtenKernel - kernel `" << pt_kernel_name
               << "` not found.";
@@ -230,8 +234,10 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VarBase>& ins,
                                const framework::OperatorWithKernel& op,
                                const platform::Place& place,
                                const framework::AttributeMap& attrs,
-                               const framework::AttributeMap& default_attrs) {
-  return PrepareImpl<VarBase>(ins, outs, op, place, attrs, default_attrs);
+                               const framework::AttributeMap& default_attrs,
+                               pten::KernelContext* pt_kernel_context) {
+  return PrepareImpl<VarBase>(ins, outs, op, place, attrs, default_attrs,
+                              pt_kernel_context);
 }
 
 PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
@@ -239,18 +245,19 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
                                const framework::OperatorWithKernel& op,
                                const platform::Place& place,
                                const framework::AttributeMap& attrs,
-                               const framework::AttributeMap& default_attrs) {
+                               const framework::AttributeMap& default_attrs,
+                               pten::KernelContext* pt_kernel_context) {
   return PrepareImpl<VariableWrapper>(ins, outs, op, place, attrs,
-                                      default_attrs);
+                                      default_attrs, pt_kernel_context);
 }
 
 template <typename VarType>
-static pten::KernelContext BuildDygraphPtenKernelContext(
+static void BuildDygraphPtenKernelContext(
     const framework::KernelSignature& pt_kernel_signature,
     const pten::Kernel& pt_kernel, const NameVarMap<VarType>& ins,
     const NameVarMap<VarType>& outs, const framework::AttributeMap& attrs,
     const framework::AttributeMap& default_attrs,
-    const platform::DeviceContext& dev_ctx) {
+    platform::DeviceContext* dev_ctx, pten::KernelContext* kernel_ctx) {
   // TODO(chenweihang): now only work for very simple case,
   // many cases need to be deal with later:
   // 1. the input and output are not tensor
@@ -258,7 +265,7 @@ static pten::KernelContext BuildDygraphPtenKernelContext(
   // 3. needless attributes remove
   // 4. use pt Tensor directly
   // 5. kernel input is not DenseTensor
-  pten::KernelContext op_kernel_ctx(dev_ctx);
+  kernel_ctx->SetDeviceContext(dev_ctx);
 
   auto& input_names = std::get<0>(pt_kernel_signature.args);
   auto& attr_names = std::get<1>(pt_kernel_signature.args);
@@ -289,27 +296,53 @@ static pten::KernelContext BuildDygraphPtenKernelContext(
   for (size_t i = 0; i < input_names.size(); ++i) {
     auto& in_def = input_defs.at(i);
     auto& ins_vector = ins.at(input_names[i]);
-
-    paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
-    for (auto var : ins_vector) {
-      const auto& variable = var->Var();
-      tmp_inputs.emplace_back(
-          experimental::MakePtenTensorBaseFromVar(variable, in_def));
+    if (kernel_ctx->InputsSize() <= i) {
+      paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
+      for (const auto& var : ins_vector) {
+        const auto& variable = var->Var();
+        tmp_inputs.emplace_back(
+            experimental::MakePtenTensorBaseFromVar(variable, in_def));
+      }
+      kernel_ctx->EmplaceBackInputs(std::move(tmp_inputs));
+    } else {
+      size_t input_size = kernel_ctx->InputsSize();
+      for (size_t j = 0; j < ins_vector.size(); ++j) {
+        if (input_size > i + j) {
+          experimental::ReMakePtenDenseTensorFromVar(
+              ins_vector[j]->Var(), in_def,
+              kernel_ctx->MutableInputAt<pten::DenseTensor>(i + j));
+        }
+        // TODO(chenweihang): adapt multi-input case later
+      }
+      kernel_ctx->MutableInputRangeAt(i) =
+          std::make_pair(i, i + ins_vector.size());
     }
-    op_kernel_ctx.EmplaceBackInputs(std::move(tmp_inputs));
   }
 
   for (size_t i = 0; i < output_names.size(); ++i) {
     auto& out_def = output_defs.at(i);
     auto& outs_vector = outs.at(output_names[i]);
-
-    paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
-    for (auto var : outs_vector) {
-      auto* variable = var->MutableVar();
-      tmp_outputs.emplace_back(
-          experimental::MakePtenTensorBaseFromVar(variable, out_def));
+    if (kernel_ctx->OutputsSize() <= i) {
+      paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
+      for (auto& var : outs_vector) {
+        auto* variable = var->MutableVar();
+        tmp_outputs.emplace_back(
+            experimental::MakePtenTensorBaseFromVar(variable, out_def));
+      }
+      kernel_ctx->EmplaceBackOutputs(std::move(tmp_outputs));
+    } else {
+      size_t output_size = kernel_ctx->OutputsSize();
+      for (size_t j = 0; j < outs_vector.size(); ++j) {
+        if (output_size > i + j) {
+          experimental::ReMakePtenDenseTensorFromVar(
+              outs_vector[j]->MutableVar(), out_def,
+              kernel_ctx->MutableOutputAt<pten::DenseTensor>(i + j));
+        }
+        // TODO(chenweihang): adapt multi-output case later
+      }
+      kernel_ctx->MutableOutputRangeAt(i) =
+          std::make_pair(i, i + outs_vector.size());
     }
-    op_kernel_ctx.EmplaceBackOutputs(std::move(tmp_outputs));
   }
 
   for (size_t i = 0; i < attr_names.size(); ++i) {
@@ -319,11 +352,11 @@ static pten::KernelContext BuildDygraphPtenKernelContext(
       // TODO(zhangyunfei): Scalar should hold scaler type, and we should check
       // attribtue type by attr_defs
       if (std::type_index(attr.type()) == std::type_index(typeid(float))) {
-        op_kernel_ctx.EmplaceBackAttr(
+        kernel_ctx->EmplaceBackAttr(
             std::move(pten::Scalar(BOOST_GET_CONST(float, attr))));
       } else if (std::type_index(attr.type()) ==
                  std::type_index(typeid(std::string))) {
-        op_kernel_ctx.EmplaceBackAttr(
+        kernel_ctx->EmplaceBackAttr(
             std::move(pten::Scalar(BOOST_GET_CONST(std::string, attr))));
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
@@ -334,11 +367,11 @@ static pten::KernelContext BuildDygraphPtenKernelContext(
     } else {
       // TODO(chenweihang): support other attrs later
       if (attr_defs[i].type_index == std::type_index(typeid(int))) {
-        op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(int, attr));
+        kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int, attr));
       } else if (attr_defs[i].type_index == std::type_index(typeid(float))) {
-        op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(float, attr));
+        kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(float, attr));
       } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) {
-        op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(bool, attr));
+        kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(bool, attr));
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "unsupported cast op attribute `%s` when construct "
@@ -347,8 +380,6 @@ static pten::KernelContext BuildDygraphPtenKernelContext(
       }
     }
   }
-
-  return op_kernel_ctx;
 }
 
 template <typename VarType>
@@ -409,20 +440,23 @@ template <typename VarType>
 static void PreparedOpRunPtImpl(
     const framework::OperatorBase& op,
     const framework::KernelSignature& pt_kernel_signature,
-    const pten::Kernel& pt_kernel, platform::DeviceContext* dev_ctx,
-    const NameVarMap<VarType>& ins, const NameVarMap<VarType>& outs,
-    const framework::AttributeMap& attrs,
+    const pten::Kernel& pt_kernel, pten::KernelContext* pt_kernel_context,
+    platform::DeviceContext* dev_ctx, const NameVarMap<VarType>& ins,
+    const NameVarMap<VarType>& outs, const framework::AttributeMap& attrs,
     const framework::AttributeMap& default_attrs) {
   DygraphInferShapeContext<VarType> infer_shape_ctx(&ins, &outs, &attrs,
                                                     &default_attrs, op.Type());
   static_cast<const framework::OperatorWithKernel&>(op).InferShape(
       &infer_shape_ctx);
 
-  auto op_kernel_ctx = BuildDygraphPtenKernelContext<VarType>(
-      pt_kernel_signature, pt_kernel, ins, outs, attrs, default_attrs,
-      *dev_ctx);
+  BuildDygraphPtenKernelContext<VarType>(pt_kernel_signature, pt_kernel, ins,
+                                         outs, attrs, default_attrs, dev_ctx,
+                                         pt_kernel_context);
+
+  pt_kernel(pt_kernel_context);
 
-  pt_kernel(&op_kernel_ctx);
+  // Ensure that it does not affect the VarBase life cycle management
+  pt_kernel_context->ClearData();
 
   // TODO(chenweihang): add debug flags later
   // TODO(chenweihang): deal with complex cases later
@@ -434,7 +468,8 @@ void PreparedOp::Run(const NameVarMap<VarBase>& ins,
                      const framework::AttributeMap& default_attrs) {
   if (run_pten_kernel_) {
     PreparedOpRunPtImpl<VarBase>(op_, pt_kernel_signature_, pt_kernel_,
-                                 dev_ctx_, ins, outs, attrs, default_attrs);
+                                 pt_kernel_context_, dev_ctx_, ins, outs, attrs,
+                                 default_attrs);
   } else {
     PreparedOpRunImpl<VarBase>(op_, ctx_, kernel_type_, func_, dev_ctx_, ins,
                                outs, attrs, default_attrs);
@@ -447,8 +482,8 @@ void PreparedOp::Run(const NameVarMap<VariableWrapper>& ins,
                      const framework::AttributeMap& default_attrs) {
   if (run_pten_kernel_) {
     PreparedOpRunPtImpl<VariableWrapper>(op_, pt_kernel_signature_, pt_kernel_,
-                                         dev_ctx_, ins, outs, attrs,
-                                         default_attrs);
+                                         pt_kernel_context_, dev_ctx_, ins,
+                                         outs, attrs, default_attrs);
   } else {
     PreparedOpRunImpl<VariableWrapper>(op_, ctx_, kernel_type_, func_, dev_ctx_,
                                        ins, outs, attrs, default_attrs);
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index 144f921861f9e1c0d4ace3a2d2ae089425e1e80b..5262b265b1b5397216d6b16abddac8c880acc3f9 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -155,21 +155,25 @@ class PreparedOp {
              const framework::RuntimeContext& ctx,
              const framework::OpKernelType& kernel_type,
              const framework::KernelSignature& kernel_signature,
-             const pten::Kernel& pt_kernel, platform::DeviceContext* dev_ctx);
+             const pten::Kernel& pt_kernel,
+             pten::KernelContext* pt_kernel_context,
+             platform::DeviceContext* dev_ctx);
 
   static PreparedOp Prepare(const NameVarMap<VarBase>& ins,
                             const NameVarMap<VarBase>& outs,
                             const framework::OperatorWithKernel& op,
                             const platform::Place& place,
                             const framework::AttributeMap& attrs,
-                            const framework::AttributeMap& default_attrs);
+                            const framework::AttributeMap& default_attrs,
+                            pten::KernelContext* pt_kernel_context = nullptr);
 
   static PreparedOp Prepare(const NameVarMap<VariableWrapper>& ins,
                             const NameVarMap<VariableWrapper>& outs,
                             const framework::OperatorWithKernel& op,
                             const platform::Place& place,
                             const framework::AttributeMap& attrs,
-                            const framework::AttributeMap& default_attrs);
+                            const framework::AttributeMap& default_attrs,
+                            pten::KernelContext* pt_kernel_context = nullptr);
 
   void Run(const NameVarMap<VarBase>& in, const NameVarMap<VarBase>& out,
            const framework::AttributeMap& attrs,
@@ -194,6 +198,9 @@ class PreparedOp {
   bool run_pten_kernel_{false};
   framework::KernelSignature pt_kernel_signature_;
   pten::Kernel pt_kernel_;
+  // In order to reduce the compatibility phase
+  // performance overhead, temporarily cache KernelContext
+  pten::KernelContext* pt_kernel_context_;
 };
 
 }  // namespace imperative
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index 0f363d0ea1bff87c05c15912134e9c01bada521e..1d06a63e38f8d1ec4ed52b158fbfd62c135ac59c 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -213,6 +213,8 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
     OpBase::Run(*op, new_ins, outs, attrs, default_attrs, place);
   } catch (platform::EnforceNotMet& exception) {
     framework::AppendErrorOpHint(type, &exception);
+    // Compatible impl: clear pten kernel context data when an error is thrown
+    OpBase::GetKernelContext()->ClearData();
     throw std::move(exception);
   } catch (std::exception& ex) {
     PADDLE_THROW(platform::errors::Fatal(
diff --git a/paddle/pten/api/lib/creation.cc b/paddle/pten/api/lib/creation.cc
index 047b19010a26c99b05de84cbef6fe69c06f73f6a..e2cd611dbda5f53e81e75626be04ce64f41f4a71 100644
--- a/paddle/pten/api/lib/creation.cc
+++ b/paddle/pten/api/lib/creation.cc
@@ -38,7 +38,7 @@ Tensor full(const std::vector<int64_t>& shape,
 
   // 2. Get Device Context
   auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(*dev_ctx);
+  auto kernel_context = pten::KernelContext(dev_ctx);
 
   // 3. Auto data transform
   kernel_context.EmplaceBackAttr(value);
@@ -75,7 +75,7 @@ Tensor full_like(const Tensor& x,
 
   // 2. Get Device Context
   auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(*dev_ctx);
+  auto kernel_context = pten::KernelContext(dev_ctx);
 
   // 3. Auto data transform
   auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
diff --git a/paddle/pten/api/lib/linalg.cc b/paddle/pten/api/lib/linalg.cc
index 587b9cd0f2726fc80d7674a8faa1f222c5e5d2fc..0ede7b8a68b416c0101579321e4fd507a69dc897 100644
--- a/paddle/pten/api/lib/linalg.cc
+++ b/paddle/pten/api/lib/linalg.cc
@@ -38,7 +38,7 @@ Tensor dot(const Tensor& x, const Tensor& y) {
 
   // 2. Get Device Context
   auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(*dev_ctx);
+  auto kernel_context = pten::KernelContext(dev_ctx);
 
   // 3. Auto data transform
   auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
@@ -76,7 +76,7 @@ Tensor matmul(const Tensor& x,
 
   // 2. Get Device Context
   auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(*dev_ctx);
+  auto kernel_context = pten::KernelContext(dev_ctx);
 
   // 3. Auto data transform
   auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
diff --git a/paddle/pten/api/lib/manipulation.cc b/paddle/pten/api/lib/manipulation.cc
index 9f071ce8c2d14be013d69e14fa387457b28b27cf..dd16f4f7f5825beb3e82f4eb5368a0b720d6f5ad 100644
--- a/paddle/pten/api/lib/manipulation.cc
+++ b/paddle/pten/api/lib/manipulation.cc
@@ -34,7 +34,7 @@ Tensor flatten(const Tensor& x, int start_axis, int stop_axis) {
 
   // 2. Get Device Context
   auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(*dev_ctx);
+  auto kernel_context = pten::KernelContext(dev_ctx);
 
   // 3. Auto data transform
   auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
diff --git a/paddle/pten/api/lib/math.cc b/paddle/pten/api/lib/math.cc
index 6cb7849e529e038eb8e253749d3876c1bec87029..8102bbaaa58eaecc0b9551032f70d00057f3b856 100644
--- a/paddle/pten/api/lib/math.cc
+++ b/paddle/pten/api/lib/math.cc
@@ -36,7 +36,7 @@ Tensor mean(const Tensor& x) {
 
   // 2. Get Device Context
   auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(*dev_ctx);
+  auto kernel_context = pten::KernelContext(dev_ctx);
 
   // 3. Auto data transform
   auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
diff --git a/paddle/pten/api/lib/utils/storage.h b/paddle/pten/api/lib/utils/storage.h
index 0a88c893f4dcf96e058cd85b0c43767dec9a6197..242ea6476ae983781d3d9eb1e959b5091b2495f4 100644
--- a/paddle/pten/api/lib/utils/storage.h
+++ b/paddle/pten/api/lib/utils/storage.h
@@ -75,6 +75,24 @@ class SharedStorage : public pten::Storage {
     return allocation_;
   }
 
+  // Temporary method: for fluid Tensor compatibility and better performance
+  void ResetAllocation(std::shared_ptr<paddle::memory::Allocation> allocation,
+                       size_t offset) {
+    allocation_ = allocation;
+    data_ = pten::Allocation(
+        reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(allocation->ptr()) +
+                                offset),
+        allocation->place());
+    size_ = allocation->size();
+  }
+
+  // Temporary method: for fluid Tensor compatibility and better performance
+  void Reset() {
+    allocation_.reset();
+    data_.Clear();
+    size_ = 0;
+  }
+
  private:
   int64_t size_{0};
   std::shared_ptr<paddle::memory::Allocation> allocation_;
diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc
index 628fde3a1a4ddb089979356292e23df01f3afb4b..52554bf7af0cadeff416546ec7c21cfe2988a189 100644
--- a/paddle/pten/api/lib/utils/tensor_utils.cc
+++ b/paddle/pten/api/lib/utils/tensor_utils.cc
@@ -14,6 +14,10 @@ limitations under the License. */
 
 #include "paddle/pten/api/lib/utils/tensor_utils.h"
 
+#include <vector>
+
+#include "paddle/pten/core/compat_utils.h"
+
 namespace paddle {
 namespace experimental {
 
@@ -126,5 +130,101 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) {
   MovesStorage(src, static_cast<paddle::framework::Tensor*>(dst));
 }
 
+void ReMakePtenDenseTensor(const paddle::framework::Tensor& src,
+                           pten::DenseTensor* dst) {
+  auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst);
+  meta->dims = src.dims();
+  // Since the type of DenseTensorMeta is const, const_cast must be used
+  const_cast<DataType&>(meta->type) = pten::TransToPtenDataType(src.type());
+  // Since the layout of DenseTensorMeta is const, const_cast must be used
+  const_cast<DataLayout&>(meta->layout) =
+      pten::TransToPtenDataLayout(src.layout());
+  auto* shared_storage = static_cast<SharedStorage*>(
+      pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst));
+  PADDLE_ENFORCE_NOT_NULL(
+      shared_storage,
+      platform::errors::NotFound(
+          "Target DenseTensor's shared storage is nullptr."));
+  shared_storage->ResetAllocation(src.Holder(), src.offset());
+}
+
+void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src,
+                           pten::DenseTensor* dst) {
+  auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst);
+  meta->dims = src.dims();
+  // Since the type of DenseTensorMeta is const, const_cast must be used
+  const_cast<DataType&>(meta->type) = pten::TransToPtenDataType(src.type());
+  // Since the layout of DenseTensorMeta is const, const_cast must be used
+  const_cast<DataLayout&>(meta->layout) =
+      pten::TransToPtenDataLayout(src.layout());
+  SetLoD(&(meta->lod), src.lod());
+  auto* shared_storage = static_cast<SharedStorage*>(
+      pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst));
+  PADDLE_ENFORCE_NOT_NULL(
+      shared_storage,
+      platform::errors::NotFound(
+          "Target DenseTensor's shared storage is nullptr."));
+  shared_storage->ResetAllocation(src.Holder(), src.offset());
+}
+
+void ReMakePtenDenseTensorFromVar(const framework::Variable& variable,
+                                  const pten::TensorArgDef& arg_def,
+                                  pten::DenseTensor* dst) {
+  auto expected_place = pten::TransToFluidPlace(arg_def.backend);
+
+  if (variable.IsType<framework::LoDTensor>()) {
+    const auto& tensor = variable.Get<framework::LoDTensor>();
+    if (!platform::is_same_place(tensor.place(), expected_place)) {
+      framework::LoDTensor tmp_tensor;
+      framework::TensorCopySync(tensor, expected_place, &tmp_tensor);
+      ReMakePtenDenseTensor(tmp_tensor, dst);
+    } else {
+      ReMakePtenDenseTensor(tensor, dst);
+    }
+  } else if (variable.IsType<framework::SelectedRows>()) {
+    // TODO(chenweihang): now we don't deal with row and height
+    // by xiaowei's advice
+    const auto& tensor = variable.Get<framework::SelectedRows>();
+    if (!platform::is_same_place(tensor.value().place(), expected_place)) {
+      framework::Tensor tmp_tensor;
+      TensorCopySync(tensor.value(), expected_place, &tmp_tensor);
+      // TODO(chenweihang): adapt SelectedRows by xiaowei's design
+      ReMakePtenDenseTensor(tmp_tensor, dst);
+    } else {
+      ReMakePtenDenseTensor(tensor.value(), dst);
+    }
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported shared input `%s` type now when call pt kernel.",
+        framework::ToTypeName(variable.Type())));
+  }
+}
+
+void ReMakePtenDenseTensorFromVar(framework::Variable* variable,
+                                  const pten::TensorArgDef& arg_def,
+                                  pten::DenseTensor* dst) {
+  // Call mutable_data before running the kernel, to avoid sharing the output
+  // from KernelContext back to the original tensor
+  if (variable->template IsType<framework::LoDTensor>()) {
+    auto* tensor = variable->template GetMutable<framework::LoDTensor>();
+    // TODO(chenweihang): use original var type if arg_def.dtype is UNDEFINED
+    tensor->mutable_data(pten::TransToFluidPlace(arg_def.backend),
+                         pten::TransToProtoVarType(arg_def.dtype));
+    ReMakePtenDenseTensor(*tensor, dst);
+  } else if (variable->template IsType<framework::SelectedRows>()) {
+    auto* tensor = variable->template GetMutable<framework::SelectedRows>();
+    tensor->mutable_value()->mutable_data(
+        pten::TransToFluidPlace(arg_def.backend),
+        pten::TransToProtoVarType(arg_def.dtype));
+    // TODO(chenweihang): adapt SelectedRows by xiaowei's design,
+    // here the row and height will be lost in output!
+    ReMakePtenDenseTensor(tensor->value(), dst);
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported shared output `%s` type now when call pt kernel.",
+        framework::ToTypeName(variable->Type())));
+  }
+}
+
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/pten/api/lib/utils/tensor_utils.h b/paddle/pten/api/lib/utils/tensor_utils.h
index 625d6702f8b6d4814b2fbc52154ed4e4efc6fbdd..c1840d97fd2e33859fc5dfcff556c72a1ddab0ac 100644
--- a/paddle/pten/api/lib/utils/tensor_utils.h
+++ b/paddle/pten/api/lib/utils/tensor_utils.h
@@ -44,5 +44,29 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst);
 
 void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst);
 
+/**
+ * To improve performance in the compatibility state, some tricky helper
+ * functions are added.
+ *
+ * The ReMake** function takes out the LoDTensor information and directly
+ * replaces it with the corresponding member of the DenseTensor to avoid
+ * the overhead caused by frequent construction and destruction of the
+ * DenseTensor.
+ */
+
+void ReMakePtenDenseTensor(const paddle::framework::Tensor& src,
+                           pten::DenseTensor* dst);
+
+void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src,
+                           pten::DenseTensor* dst);
+
+void ReMakePtenDenseTensorFromVar(const framework::Variable& variable,
+                                  const pten::TensorArgDef& arg_def,
+                                  pten::DenseTensor* dst);
+
+void ReMakePtenDenseTensorFromVar(framework::Variable* variable,
+                                  const pten::TensorArgDef& arg_def,
+                                  pten::DenseTensor* dst);
+
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/pten/core/compat_utils.h b/paddle/pten/core/compat_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..289c311bf3eba27f942c24657f9197f7c4b071e3
--- /dev/null
+++ b/paddle/pten/core/compat_utils.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/api/lib/utils/storage.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/storage.h"
+#include "paddle/pten/core/tensor_meta.h"
+
+namespace pten {
+
+/**
+ * In order to meet some adaptation requirements of the compatible state,
+ * this class is added to provide some tool functions.
+ *
+ * These utility functions may be deleted in the future; it is not recommended
+ * to use them widely in the framework.
+ */
+
+class CompatibleDenseTensorUtils {
+ public:
+  static Storage* UnsafeGetMutableStorage(DenseTensor* tensor) {
+    return tensor->storage_.get();
+  }
+
+  static DenseTensorMeta* GetMutableMeta(DenseTensor* tensor) {
+    return &(tensor->meta_);
+  }
+
+  // can only deal with SharedStorage for now
+  static void ClearStorage(DenseTensor* tensor) {
+    // use static_cast to improve performance, replace by dynamic_cast later
+    static_cast<paddle::experimental::SharedStorage*>(tensor->storage_.get())
+        ->Reset();
+  }
+};
+
+}  // namespace pten
diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h
index 46932ecac2ad0dd97cbc72c4e6a29ed16264f918..e8e57b333ae99e0cd836ddf0cab1b4f09664a749 100644
--- a/paddle/pten/core/dense_tensor.h
+++ b/paddle/pten/core/dense_tensor.h
@@ -21,6 +21,8 @@ limitations under the License. */
 
 namespace pten {
 
+class CompatibleDenseTensorUtils;
+
 /// \brief The Dense tensor store values in a contiguous sequential block
 /// of memory where all values are represented. Tensors or multi-dimensional
 /// arrays are used in math operators.
@@ -164,6 +166,9 @@ class DenseTensor : public TensorBase,
   /// \return The const data pointer value of raw type.
   const void* data() const;
 
+ private:
+  friend class CompatibleDenseTensorUtils;
+
  private:
   DenseTensorMeta meta_;
   intrusive_ptr<Storage> storage_;
diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h
index ac1ed668f7bf5abbd3f0a9724a2921bb8a96bb41..973640906e0de0a121f5e87d7832e14db241969c 100644
--- a/paddle/pten/core/kernel_context.h
+++ b/paddle/pten/core/kernel_context.h
@@ -14,8 +14,10 @@
 
 #pragma once
 
+#include <iterator>
 #include <utility>
 
+#include "paddle/pten/core/compat_utils.h"
 #include "paddle/pten/core/tensor_base.h"
 #include "paddle/utils/any.h"
 #include "paddle/utils/small_vector.h"
@@ -39,16 +41,14 @@ using DataLayout = paddle::experimental::DataLayout;
  */
 class KernelContext {
  public:
-  explicit KernelContext(const DeviceContext& dev_ctx) : dev_ctx_(dev_ctx) {}
-  KernelContext(const DeviceContext& dev_ctx,
-                const paddle::SmallVector<std::shared_ptr<TensorBase>>& inputs,
-                const paddle::SmallVector<std::shared_ptr<TensorBase>>& outputs,
-                const paddle::SmallVector<paddle::any>& attrs)
-      : dev_ctx_(dev_ctx), inputs_(inputs), outputs_(outputs), attrs_(attrs) {}
+  KernelContext() : dev_ctx_(nullptr) {}  // unset until SetDeviceContext()
+  explicit KernelContext(DeviceContext* dev_ctx) : dev_ctx_(dev_ctx) {}
+
+  void SetDeviceContext(DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; }
 
   template <typename CtxType>
   const CtxType& GetDeviceContext() const {
-    return static_cast<const CtxType&>(dev_ctx_);
+    return static_cast<const CtxType&>(*dev_ctx_);
   }
 
   void EmplaceBackInput(std::shared_ptr<TensorBase> input) {
@@ -59,14 +59,14 @@ class KernelContext {
   }
 
   void EmplaceBackInputs(
-      const paddle::SmallVector<std::shared_ptr<TensorBase>>& inputs) {
+      paddle::SmallVector<std::shared_ptr<TensorBase>> inputs) {
     int index = inputs_.size();
-    for (auto in : inputs) {
-      inputs_.emplace_back(std::move(in));
-    }
     // Record the start and end index of the input
     input_range_.emplace_back(
         std::pair<int, int>(index, index + inputs.size()));
+    inputs_.insert(inputs_.end(),
+                   std::make_move_iterator(inputs.begin()),
+                   std::make_move_iterator(inputs.end()));
   }
 
   void EmplaceBackOutput(std::shared_ptr<TensorBase> output) {
@@ -77,14 +77,14 @@ class KernelContext {
   }
 
   void EmplaceBackOutputs(
-      const paddle::SmallVector<std::shared_ptr<TensorBase>>& outputs) {
+      paddle::SmallVector<std::shared_ptr<TensorBase>> outputs) {
     int index = outputs_.size();
-    for (auto out : outputs) {
-      outputs_.emplace_back(std::move(out));
-    }
     // Record the start and end index of the input
     output_range_.emplace_back(
         std::pair<int, int>(index, index + outputs.size()));
+    outputs_.insert(outputs_.end(),
+                    std::make_move_iterator(outputs.begin()),
+                    std::make_move_iterator(outputs.end()));
   }
 
   void EmplaceBackAttr(paddle::any attr) {
@@ -115,6 +115,19 @@ class KernelContext {
     return output_range_.at(idx);
   }
 
+  std::pair<int, int>& MutableInputRangeAt(size_t idx) {
+    return input_range_[idx];
+  }
+
+  std::pair<int, int>& MutableOutputRangeAt(size_t idx) {
+    return output_range_[idx];
+  }
+
+  template <typename TensorType>
+  TensorType* MutableInputAt(size_t idx) {
+    return static_cast<TensorType*>(inputs_.at(idx).get());
+  }
+
   template <typename TensorType>
   TensorType* MutableOutputAt(size_t idx) {
     return static_cast<TensorType*>(outputs_.at(idx).get());
@@ -140,12 +153,30 @@ class KernelContext {
     }
   }
 
+  // Temporary method: for fluid Tensor compatibility and better performance
+  // Only deals with DenseTensor for now
+  void ClearData() {
+    for (auto& in : inputs_) {
+      CompatibleDenseTensorUtils::ClearStorage(
+          static_cast<DenseTensor*>(in.get()));
+    }
+    for (auto& out : outputs_) {
+      CompatibleDenseTensorUtils::ClearStorage(
+          static_cast<DenseTensor*>(out.get()));
+    }
+    attrs_.clear();
+  }
+
+  size_t InputsSize() const { return inputs_.size(); }
+  size_t OutputsSize() const { return outputs_.size(); }
+  size_t AttrsSize() const { return attrs_.size(); }
+
  private:
   bool IsDuplicable() const { return input_range_.size() != inputs_.size(); }
 
  private:
   // DeviceContext base class
-  const DeviceContext& dev_ctx_;
+  DeviceContext* dev_ctx_;
 
   // TODO(chenweihang): Tensor -> Tensor*, Tensor should by managed `scope`
   // Note: can't use API Tensor here, the inference don't use this API Tensor
@@ -156,11 +187,6 @@ class KernelContext {
   // Only contains input like list[Tensor] need `range`
   paddle::SmallVector<std::pair<int, int>> input_range_;
   paddle::SmallVector<std::pair<int, int>> output_range_;
-
-  // Only static graph need `name`
-  // TODO(chenweihang): replaced by paddle::string_view
-  paddle::SmallVector<std::string> input_names_;
-  paddle::SmallVector<std::string> output_names_;
 };
 
 }  // namespace pten