From 8784ec65cc4ebf8fdfce9d78b294165341f348ad Mon Sep 17 00:00:00 2001
From: zyfncg <zhangyunfei07@baidu.com>
Date: Thu, 20 Jan 2022 10:19:19 +0800
Subject: [PATCH] =?UTF-8?q?=E3=80=90PTen=E3=80=91Remove=20code=20of=20conv?=
 =?UTF-8?q?erting=20Tensor=20to=20DenseTensor=20(#38926)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* remove MakePtenTensor in BuildKernelContext

* fix a bug caused by storage

* remove WriteBackOutput in dynamic and static mode

* fix compile error of std::max

* fix compile error of std::max

* fix data_type bug

* fix memory alloc bug

* add some debug info

* fix compile problem

* fix problem of data_type check

* comment out some unreached code
---
 paddle/fluid/framework/operator.cc           |  93 +++++++++++++++--
 paddle/fluid/framework/operator.h            |   8 ++
 paddle/fluid/framework/pten_utils.cc         |  32 ++++--
 paddle/fluid/framework/pten_utils.h          |   3 +
 paddle/fluid/imperative/prepared_operator.cc |  96 +++++++++++-------
 paddle/pten/api/lib/utils.cc                 |   4 +-
 paddle/pten/api/lib/utils/tensor_utils.cc    | 100 ++-----------------
 paddle/pten/api/lib/utils/tensor_utils.h     |  15 +--
 paddle/pten/common/data_type.h               |   1 +
 paddle/pten/core/dense_tensor.cc             |   3 +-
 paddle/pten/core/kernel_context.cc           |  43 +++-----
 paddle/pten/core/kernel_context.h            |  35 +++----
 paddle/pten/kernels/gpu/copy_kernel.cu       |   6 +-
 paddle/pten/tests/api/scale_api.h            |   4 +-
 14 files changed, 231 insertions(+), 212 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index ea45ef857df..e69a6c2e88c 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1192,9 +1192,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
                                        platform::EventRole::kInnerOp);
     if (run_pten_kernel_) {
       pten::KernelContext pt_kernel_context;
+      // Do data transform before building KernelContext
+      PreparePtenData(exec_scope, *pt_kernel_, *pt_kernel_signature_,
+                      runtime_ctx);
       BuildPtenKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context);
       (*pt_kernel_)(&pt_kernel_context);
-      WriteBackToOutputs(runtime_ctx, &pt_kernel_context);
     } else {
       (*kernel_func_)(
           ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx));
@@ -1786,6 +1788,62 @@ KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs(
       pten::TransToPtenKernelName(Type()));
 }
 
+// Checks each initialized tensor input against the place required by the
+// kernel's input defs. The transfer itself is still disabled (see the TODO
+// below), so no transfer scope is created and nullptr is always returned.
+Scope* OperatorWithKernel::PreparePtenData(
+    const Scope& scope, const pten::Kernel& pt_kernel,
+    const KernelSignature& pt_kernel_signature, RuntimeContext* ctx) const {
+  auto& input_names = std::get<0>(pt_kernel_signature.args);
+  // Bind by reference so the argument definitions are not copied per call.
+  const auto& input_defs = pt_kernel.args_def().input_defs();
+  PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "The size of inputs_args names (%d) must be equal to "
+                        "the size of kernel input_defs (%d).",
+                        input_names.size(), input_defs.size()));
+  Scope* new_scope = nullptr;
+  for (size_t i = 0; i < input_defs.size(); ++i) {
+    auto& in_def = input_defs.at(i);
+    auto& ins_vector = ctx->inputs.at(input_names[i]);
+    for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
+      // Only tensor can be transferred to another device.
+      auto* var = ins_vector[offset];
+      if (var == nullptr || !VarIsTensor(*var)) {
+        continue;
+      }
+
+      auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
+      if (!tensor_in->IsInitialized()) {
+        continue;
+      }
+
+      auto expected_place = pten::TransToFluidPlace(in_def.backend);
+      if (platform::is_same_place(tensor_in->place(), expected_place)) {
+        continue;
+      }
+
+      // TODO(zyfncg): Now there is no kernel which need to transform input
+      // data, so we commented out following code temporarily,
+      // and it will be used in the future.
+      // VLOG(3) << "PTen Transform Variable " << input_names[i] << " from "
+      //         << tensor_in->place() << " to " << expected_place;
+      // if (!new_scope) {
+      //   new_scope = &scope.NewScope();
+      // }
+      // // Create new var with the same name in transfer scopes
+      // auto* trans_var = new_scope->Var(input_names[i]);
+      // ins_vector[offset] = trans_var;
+      // // Do transfer
+      // Tensor out;
+      // framework::TensorCopySync(*tensor_in, expected_place, &out);
+      // SetTensorToVariable(*var, out, trans_var);
+    }
+  }
+
+  return new_scope;
+}
+
 void OperatorWithKernel::BuildPtenKernelContext(
     const RuntimeContext& ctx, platform::DeviceContext* dev_ctx,
     pten::KernelContext* pt_kernel_context) const {
@@ -1818,7 +1876,6 @@ void OperatorWithKernel::BuildPtenKernelContext(
                         attr_names.size(), attr_defs.size()));
 
   for (size_t i = 0; i < input_names.size(); ++i) {
-    auto& in_def = input_defs.at(i);
     auto& ins_vector = ctx.inputs.at(input_names[i]);
 
     // calcute the start and end index of the input tensors
@@ -1827,14 +1884,22 @@ void OperatorWithKernel::BuildPtenKernelContext(
     size_t end_idx = start_idx + ins_vector.size();
 
     for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
-      pt_kernel_context->EmplaceBackInputWithoutSetRange(
-          experimental::MakePtenTensorBaseFromVar(*ins_vector[offset], in_def));
+      const framework::Tensor* tensor_in = nullptr;
+      auto* var = ins_vector[offset];
+      if (var->IsType<framework::LoDTensor>()) {
+        tensor_in = &(var->Get<framework::LoDTensor>());
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported input `%s` type when call pt kernel.",
+            framework::ToTypeName(var->Type())));
+      }  // TODO(zyfncg): Add support for SelectedRows
+
+      pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
     }
     pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
   }
 
   for (size_t i = 0; i < output_names.size(); ++i) {
-    auto& out_def = output_defs.at(i);
     auto& outs_vector = ctx.outputs.at(output_names[i]);
 
     size_t start_idx =
@@ -1842,9 +1907,21 @@ void OperatorWithKernel::BuildPtenKernelContext(
     size_t end_idx = start_idx + outs_vector.size();
 
     for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
-      pt_kernel_context->EmplaceBackOutputWithoutSetRange(
-          experimental::MakePtenTensorBaseFromVar(outs_vector[offset],
-                                                  out_def));
+      framework::Tensor* tensor_out = nullptr;
+      auto* var = outs_vector[offset];
+      if (var->template IsType<framework::LoDTensor>()) {
+        tensor_out = var->template GetMutable<framework::LoDTensor>();
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported output `%s` type when call pt kernel.",
+            framework::ToTypeName(var->Type())));
+      }  // TODO(zyfncg): Add support for SelectedRows
+
+      experimental::ResetTensorByArgDef(tensor_out, output_defs.at(i));
+      SetAllocationForOutputTenosr(
+          tensor_out, pten::TransToFluidPlace(output_defs.at(i).backend));
+
+      pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
     }
 
     // Deal with the case that some outputs are NULL when run the kernel.
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 3aab9165eae..ad84dbc9be6 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -588,6 +588,14 @@ class OperatorWithKernel : public OperatorBase {
   /* member functions for adapting to pten lib */
   void ChoosePtenKernel(const ExecutionContext& ctx) const;
 
+  /**
+   * Transfer data place for pten kernel
+   * Is this really needed?
+   */
+  Scope* PreparePtenData(const Scope& scope, const pten::Kernel& pt_kernel,
+                         const KernelSignature& pt_kernel_signature,
+                         RuntimeContext* ctx) const;
+
   void BuildPtenKernelContext(const RuntimeContext& ctx,
                               platform::DeviceContext* dev_ctx,
                               pten::KernelContext* pt_kernel_context) const;
diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc
index dddcd914ed2..4e33e641cf1 100644
--- a/paddle/fluid/framework/pten_utils.cc
+++ b/paddle/fluid/framework/pten_utils.cc
@@ -137,17 +137,17 @@ KernelArgsNameMakerByOpProto::GetInputArgsNames() {
     auto& in = op_proto_->inputs()[i];
     auto& in_name = in.name();
     if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) {
-      VLOG(3) << "Parse PtenKernel input: skip extra & quant input - "
+      VLOG(6) << "Parse PtenKernel input: skip extra & quant input - "
               << in_name;
       continue;
     }
     // If contains dispensable input, we should override the
     // GetExpectedPtenKernelArgs method self
     if (in.has_dispensable() && in.dispensable()) {
-      VLOG(3) << "Parse PtenKernel input: skip dispensable input - " << in_name;
+      VLOG(6) << "Parse PtenKernel input: skip dispensable input - " << in_name;
       continue;
     }
-    VLOG(3) << "Parse PtenKernel input: " << in_name;
+    VLOG(6) << "Parse PtenKernel input: " << in_name;
     input_names_.emplace_back(in_name);
   }
   return input_names_;
@@ -159,7 +159,7 @@ KernelArgsNameMakerByOpProto::GetOutputArgsNames() {
     auto& out = op_proto_->outputs()[i];
     auto& out_name = out.name();
     // TODO(chenweihang): outputs also need skip some cases
-    VLOG(3) << "Parse PtenKernel output: " << out_name;
+    VLOG(6) << "Parse PtenKernel output: " << out_name;
     output_names_.emplace_back(out_name);
   }
   return output_names_;
@@ -173,17 +173,17 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() {
     if (attr_name == "use_mkldnn" || attr_name == "op_role" ||
         attr_name == "op_role_var" || attr_name == "op_namescope" ||
         attr_name == "op_callstack" || attr_name == "op_device") {
-      VLOG(3) << "Parse PtenKernel attribute: skip needless attr - "
+      VLOG(6) << "Parse PtenKernel attribute: skip needless attr - "
               << attr_name;
       continue;
     }
     if ((attr.has_extra() && attr.extra()) ||
         (attr.has_quant() && attr.quant())) {
-      VLOG(3) << "Parse PtenKernel attribute: skip extra & quant attr - "
+      VLOG(6) << "Parse PtenKernel attribute: skip extra & quant attr - "
               << attr_name;
       continue;
     }
-    VLOG(3) << "Parse PtenKernel attribute: " << attr_name;
+    VLOG(6) << "Parse PtenKernel attribute: " << attr_name;
     attr_names_.emplace_back(attr_name);
   }
 
@@ -196,5 +196,23 @@ KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() {
                          GetOutputArgsNames());
 }
 
+void SetAllocationForOutputTenosr(pten::DenseTensor* tensor,
+                                  const platform::Place& place) {
+  if (!tensor->IsInitialized() || !(tensor->place() == place)) {
+    int dtype_size = tensor->dtype() == DataType::UNDEFINED
+                         ? 0
+                         : experimental::SizeOf(tensor->dtype());
+    int64_t numels = product(tensor->dims());
+    numels = numels < 0 ? 0 : numels;
+    auto tmp_allocation_ptr = memory::Alloc(place, numels * dtype_size);
+    auto& deleter = tmp_allocation_ptr.get_deleter();
+    auto* allocation_ptr = tmp_allocation_ptr.release();
+    auto shared_allocation =
+        std::shared_ptr<pten::Allocation>(allocation_ptr, deleter);
+
+    tensor->ResetHolder(shared_allocation);
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h
index 09d96045949..8bbd4f7f3c9 100644
--- a/paddle/fluid/framework/pten_utils.h
+++ b/paddle/fluid/framework/pten_utils.h
@@ -72,5 +72,8 @@ class KernelArgsNameMaker {
   virtual const paddle::SmallVector<std::string>& GetAttrsArgsNames() = 0;
 };
 
+void SetAllocationForOutputTenosr(pten::DenseTensor* tensor,
+                                  const platform::Place& place);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 6474f3c07fa..bb08191af98 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -258,6 +258,49 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
                                       default_attrs);
 }
 
+// Walks every kernel input and compares the place of each initialized
+// tensor with the place expected by the kernel's input defs. The transfer
+// itself is still disabled (see the TODO below), so nothing is modified.
+template <typename VarType>
+void PreparePtenData(const pten::Kernel& pt_kernel,
+                     const framework::KernelSignature& pt_kernel_signature,
+                     const NameVarMap<VarType>& ins) {
+  auto& input_names = std::get<0>(pt_kernel_signature.args);
+  auto& input_defs = pt_kernel.args_def().input_defs();
+
+  PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "the size of inputs_args names (%d) must be equal to "
+                        "the size of kernel input_defs (%d).",
+                        input_names.size(), input_defs.size()));
+
+  for (size_t i = 0; i < input_names.size(); ++i) {
+    auto& in_def = input_defs.at(i);
+    auto& ins_vector = ins.at(input_names[i]);
+
+    for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
+      const auto& var_base = ins_vector[offset];  // read-only: avoid a copy
+      const auto* tensor_in = GetTensorFromVar(var_base->Var());
+      if (tensor_in && tensor_in->IsInitialized()) {
+        auto expected_place = pten::TransToFluidPlace(in_def.backend);
+        if (platform::is_same_place(tensor_in->place(), expected_place)) {
+          continue;
+        }
+
+        // TODO(zyfncg): Now there is no kernel which need to transform input
+        // data, so we commented out following code temporarily,
+        // and it will be used in the future.
+        // VLOG(3) << "Pten Transform Variable " << var_base->Name() << " from "
+        //         << tensor_in->place() << " to " << expected_place;
+        // framework::Tensor tmp_tensor;
+        // framework::TensorCopySync(*tensor_in, expected_place, &tmp_tensor);
+        // SetTensorToVariable(var_base->Var(), tmp_tensor,
+        //                     var_base->MutableVar());
+      }
+    }
+  }
+}
+
 template <typename VarType>
 static void BuildDygraphPtenKernelContext(
     const framework::KernelSignature& pt_kernel_signature,
@@ -294,23 +337,19 @@ static void BuildDygraphPtenKernelContext(
                         attr_names.size(), attr_defs.size()));
 
   for (size_t i = 0; i < input_names.size(); ++i) {
-    auto& in_def = input_defs.at(i);
     auto& ins_vector = ins.at(input_names[i]);
 
     size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second);
     size_t end_idx = start_idx + ins_vector.size();
 
     for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
-      const auto& variable = ins_vector[offset]->Var();
-      kernel_ctx->EmplaceBackInputWithoutSetRange(
-          paddle::experimental::MakePtenTensorBaseFromVar(variable, in_def));
+      const auto* tensor_in = GetTensorFromVar(ins_vector[offset]->Var());
+      kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in);
     }
     kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
   }
 
   for (size_t i = 0; i < output_names.size(); ++i) {
-    auto& out_def = output_defs.at(i);
-
     size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second);
 
     auto iter = outs.find(output_names[i]);
@@ -325,9 +364,21 @@ static void BuildDygraphPtenKernelContext(
     size_t end_idx = start_idx + outs_vector.size();
 
     for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
-      kernel_ctx->EmplaceBackOutputWithoutSetRange(
-          paddle::experimental::MakePtenTensorBaseFromVar(
-              outs_vector[offset]->MutableVar(), out_def));
+      auto* var = outs_vector[offset]->MutableVar();
+      framework::Tensor* tensor_out = nullptr;
+      if (var->template IsType<framework::LoDTensor>()) {
+        tensor_out = var->template GetMutable<framework::LoDTensor>();
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported output `%s` type when call pt kernel.",
+            framework::ToTypeName(var->Type())));
+      }  // TODO(zyfncg): Add support for SelectedRows
+
+      experimental::ResetTensorByArgDef(tensor_out, output_defs.at(i));
+      framework::SetAllocationForOutputTenosr(
+          tensor_out, pten::TransToFluidPlace(output_defs.at(i).backend));
+
+      kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
     }
     kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
   }
@@ -430,29 +481,6 @@ static void BuildDygraphPtenKernelContext(
   }
 }
 
-template <typename VarType>
-static void WriteBackToOutputs(
-    const framework::KernelSignature& pt_kernel_signature,
-    const NameVarMap<VarType>& outs, pten::KernelContext* kernel_ctx) {
-  auto& output_names = std::get<2>(pt_kernel_signature.args);
-
-  for (size_t i = 0; i < output_names.size(); ++i) {
-    auto iter = outs.find(output_names[i]);
-    if (iter != outs.end()) {
-      auto& outs_vector = iter->second;
-
-      auto& range_pair = kernel_ctx->OutputRangeAt(i);
-      auto pten_outs = kernel_ctx->MutableOutputBetween<pten::DenseTensor>(
-          range_pair.first, range_pair.second);
-
-      for (size_t j = 0; j < pten_outs.size(); ++j) {
-        experimental::MakeVariableFromPtenTensor(pten_outs[j],
-                                                 outs_vector[j]->MutableVar());
-      }
-    }
-  }
-}
-
 template <typename VarType>
 static void PreparedOpRunImpl(
     const framework::OperatorBase& op, const framework::RuntimeContext& ctx,
@@ -514,6 +542,8 @@ static void PreparedOpRunPtImpl(
       &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type);
   op.Info().infer_shape_(&infer_shape_ctx);
 
+  PreparePtenData<VarType>(pt_kernel, pt_kernel_signature, ins);
+
   pten::KernelContext pt_kernel_context;
   BuildDygraphPtenKernelContext<VarType>(pt_kernel_signature, pt_kernel, ins,
                                          outs, attrs, default_attrs, dev_ctx,
@@ -529,8 +559,6 @@ static void PreparedOpRunPtImpl(
 #endif
   }
 
-  WriteBackToOutputs<VarType>(pt_kernel_signature, outs, &pt_kernel_context);
-
   // TODO(chenweihang): add debug flags later
   if (framework::IsComplexType(kernel_type.data_type_)) {
     HandleComplexGradToRealGrad<VarType>(outs);
diff --git a/paddle/pten/api/lib/utils.cc b/paddle/pten/api/lib/utils.cc
index 6eb1e5a3797..f42f3b37f0a 100644
--- a/paddle/pten/api/lib/utils.cc
+++ b/paddle/pten/api/lib/utils.cc
@@ -54,7 +54,7 @@ PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking) {
 
   // 3. Auto data transform
   auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
-  kernel_context.EmplaceBackInput(dense_x);
+  kernel_context.EmplaceBackInput(dense_x.get());
   kernel_context.EmplaceBackAttr(blocking);
 
   // 4. InferMeta
@@ -65,7 +65,7 @@ PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking) {
       pten::make_intrusive<paddle::experimental::SharedStorage>(
           pten::TransToFluidPlace(backend)),
       std::move(out_meta));
-  kernel_context.EmplaceBackOutput(dense_out);
+  kernel_context.EmplaceBackOutput(dense_out.get());
   Tensor out;
   out.set_impl(dense_out);
 
diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc
index 93b1957fe14..1420810007d 100644
--- a/paddle/pten/api/lib/utils/tensor_utils.cc
+++ b/paddle/pten/api/lib/utils/tensor_utils.cc
@@ -355,98 +355,6 @@ void ReMakePtenDenseTensor(const paddle::framework::Tensor& src,
                             dst);
 }
 
-void ReMakePtenDenseTensorByArgDefBase(const paddle::framework::Tensor& src,
-                                       const pten::TensorArgDef& arg_def,
-                                       pten::DenseTensor* dst) {
-  VLOG(3) << "ReMakePtenDenseTensor based Tensor and TensorArgDef.";
-  auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst);
-  meta->dims = src.dims();
-  meta->dtype = arg_def.dtype;
-  meta->layout = src.layout();
-  meta->offset = src.offset();
-
-  if (src.IsInitialized() &&
-      src.place() == pten::TransToFluidPlace(arg_def.backend)) {
-    dst->ResetHolder(src.Holder());
-  } else {
-    // This does not affect the correctness, and will be modified immediately.
-    // dst->mutable_data(pten::TransToFluidPlace(arg_def.backend));
-  }
-}
-
-void ReMakePtenDenseTensorByArgDef(const paddle::framework::Tensor& src,
-                                   const pten::TensorArgDef& arg_def,
-                                   pten::DenseTensor* dst) {
-  auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst);
-  SetLoD(&meta->lod, src.lod());
-  ReMakePtenDenseTensorByArgDefBase(
-      static_cast<const paddle::framework::Tensor&>(src), arg_def, dst);
-}
-
-void ReMakePtenDenseTensorFromVar(const framework::Variable& variable,
-                                  const pten::TensorArgDef& arg_def,
-                                  pten::DenseTensor* dst) {
-  auto expected_place = pten::TransToFluidPlace(arg_def.backend);
-  if (variable.IsType<framework::LoDTensor>()) {
-    const auto& tensor = variable.Get<framework::LoDTensor>();
-    // check input dtype before ReMakePtenDenseTensor
-    PADDLE_ENFORCE(
-        (arg_def.dtype == pten::TransToPtenDataType(tensor.type())),
-        paddle::platform::errors::InvalidArgument(
-            "The type of input data is diffrent from the type of the "
-            "argument's definition in kernel."));
-    if (!platform::is_same_place(tensor.place(), expected_place)) {
-      framework::LoDTensor tmp_tensor;
-      framework::TensorCopySync(tensor, expected_place, &tmp_tensor);
-      ReMakePtenDenseTensorByArgDef(tmp_tensor, arg_def, dst);
-    } else {
-      ReMakePtenDenseTensorByArgDef(tensor, arg_def, dst);
-    }
-  } else if (variable.IsType<framework::SelectedRows>()) {
-    // TODO(chenweihang): now we don't deal with row and height
-    // by xiaowei's advice
-    const auto& tensor = variable.Get<framework::SelectedRows>();
-    PADDLE_ENFORCE(
-        (arg_def.dtype == pten::TransToPtenDataType(tensor.value().type())),
-        paddle::platform::errors::InvalidArgument(
-            "The type of input data is diffrent from the type of the "
-            "argument's definition in kernel."));
-    if (!platform::is_same_place(tensor.value().place(), expected_place)) {
-      framework::Tensor tmp_tensor;
-      paddle::framework::TensorCopySync(
-          tensor.value(), expected_place, &tmp_tensor);
-      // TODO(chenweihang): adapt SelectedRows by xiaowei's design
-      ReMakePtenDenseTensorByArgDef(tmp_tensor, arg_def, dst);
-    } else {
-      ReMakePtenDenseTensorByArgDef(tensor.value(), arg_def, dst);
-    }
-  } else {
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "Unsupported shared input `%s` type now when call pt kernel.",
-        framework::ToTypeName(variable.Type())));
-  }
-}
-
-void ReMakePtenDenseTensorFromVar(framework::Variable* variable,
-                                  const pten::TensorArgDef& arg_def,
-                                  pten::DenseTensor* dst) {
-  // mutable_data before run kernel, to avoid share output form
-  // KernelContext to original tensor
-  if (variable->template IsType<framework::LoDTensor>()) {
-    auto* tensor = variable->template GetMutable<framework::LoDTensor>();
-    ReMakePtenDenseTensorByArgDef(*tensor, arg_def, dst);
-  } else if (variable->template IsType<framework::SelectedRows>()) {
-    auto* tensor = variable->template GetMutable<framework::SelectedRows>();
-    // TODO(chenweihang): adapt SelectedRows by xiaowei's design,
-    // here the row and height will lost in output!
-    ReMakePtenDenseTensorByArgDef(tensor->value(), arg_def, dst);
-  } else {
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "Unsupported shared output `%s` type now when call pt kernel.",
-        framework::ToTypeName(variable->Type())));
-  }
-}
-
 static bool IsSameAllocation(const std::shared_ptr<memory::Allocation>& a,
                              const std::shared_ptr<memory::Allocation>& b) {
   return a->ptr() == b->ptr() && a->size() == b->size() &&
@@ -489,5 +397,13 @@ void MakeVariableFromPtenTensor(pten::DenseTensor* src,
   }
 }
 
+void ResetTensorByArgDef(pten::DenseTensor* dst,
+                         const pten::TensorArgDef& arg_def) {
+  VLOG(5) << "ResetTensor by TensorArgDef.";
+  auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst);
+  meta->dtype = arg_def.dtype;
+  meta->layout = arg_def.layout;
+}
+
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/pten/api/lib/utils/tensor_utils.h b/paddle/pten/api/lib/utils/tensor_utils.h
index 0ac4ac7a331..04c3f0e912b 100644
--- a/paddle/pten/api/lib/utils/tensor_utils.h
+++ b/paddle/pten/api/lib/utils/tensor_utils.h
@@ -67,20 +67,11 @@ void SharesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst);
 void ReMakePtenDenseTensor(const paddle::framework::Tensor& src,
                            pten::DenseTensor* dst);
 
-void ReMakePtenDenseTensorByArgDef(const paddle::framework::Tensor& src,
-                                   const pten::TensorArgDef& arg_def,
-                                   pten::DenseTensor* dst);
-
-void ReMakePtenDenseTensorFromVar(const framework::Variable& variable,
-                                  const pten::TensorArgDef& arg_def,
-                                  pten::DenseTensor* dst);
-
-void ReMakePtenDenseTensorFromVar(framework::Variable* variable,
-                                  const pten::TensorArgDef& arg_def,
-                                  pten::DenseTensor* dst);
-
 void MakeVariableFromPtenTensor(pten::DenseTensor* src,
                                 framework::Variable* variable);
 
+void ResetTensorByArgDef(pten::DenseTensor* dst,
+                         const pten::TensorArgDef& arg_def);
+
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/pten/common/data_type.h b/paddle/pten/common/data_type.h
index a00d68c5354..306507d2d2b 100644
--- a/paddle/pten/common/data_type.h
+++ b/paddle/pten/common/data_type.h
@@ -73,6 +73,7 @@ inline size_t SizeOf(DataType data_type) {
     case DataType::COMPLEX128:
       return 16;
     case DataType::UNDEFINED:
+      return 0;
     case DataType::NUM_DATA_TYPES:
       PD_THROW("Data type `",
                static_cast<int>(data_type),
diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc
index 06531fe8bfd..cfe2cfa03ea 100644
--- a/paddle/pten/core/dense_tensor.cc
+++ b/paddle/pten/core/dense_tensor.cc
@@ -113,7 +113,8 @@ void* DenseTensor::mutable_data(size_t request_bytes) {
                           bytes));
     bytes = request_bytes;
   }
-  if (storage_->size() < bytes + meta_.offset || storage_->size() == 0) {
+  if (!storage_->data() || storage_->size() < bytes + meta_.offset ||
+      storage_->size() == 0) {
     VLOG(10) << "mutbale data realloc, original size: " << storage_->size()
              << ", new size: " << bytes;
     storage_->Realloc(bytes);
diff --git a/paddle/pten/core/kernel_context.cc b/paddle/pten/core/kernel_context.cc
index 74bd6d17f06..34e9fabbe67 100644
--- a/paddle/pten/core/kernel_context.cc
+++ b/paddle/pten/core/kernel_context.cc
@@ -16,20 +16,19 @@
 
 namespace pten {
 
-void KernelContext::EmplaceBackInput(std::shared_ptr<TensorBase> input) {
+void KernelContext::EmplaceBackInput(const TensorBase* input) {
   int index = inputs_.size();
-  inputs_.emplace_back(std::move(input));
+  inputs_.emplace_back(input);
   // Record the start and end index of the input
   input_range_.emplace_back(std::pair<int, int>(index, index + 1));
 }
 
-void KernelContext::EmplaceBackInputWithoutSetRange(
-    std::shared_ptr<TensorBase> input) {
-  inputs_.emplace_back(std::move(input));
+void KernelContext::EmplaceBackInputWithoutSetRange(const TensorBase* input) {
+  inputs_.emplace_back(input);
 }
 
 void KernelContext::EmplaceBackInputs(
-    paddle::SmallVector<std::shared_ptr<TensorBase>> inputs) {
+    paddle::SmallVector<const TensorBase*> inputs) {
   int index = inputs_.size();
   // Record the start and end index of the input
   input_range_.emplace_back(std::pair<int, int>(index, index + inputs.size()));
@@ -38,25 +37,23 @@ void KernelContext::EmplaceBackInputs(
                  std::make_move_iterator(inputs.end()));
 }
 
-void KernelContext::EmplaceBackOutput(std::shared_ptr<TensorBase> output) {
+void KernelContext::EmplaceBackOutput(TensorBase* output) {
   int index = outputs_.size();
-  outputs_.emplace_back(std::move(output));
+  outputs_.emplace_back(output);
   // Record the start and end index of the input
   output_range_.emplace_back(std::pair<int, int>(index, index + 1));
 }
 
-void KernelContext::EmplaceBackOutputWithoutSetRange(
-    std::shared_ptr<TensorBase> output) {
-  outputs_.emplace_back(std::move(output));
+void KernelContext::EmplaceBackOutputWithoutSetRange(TensorBase* output) {
+  outputs_.emplace_back(output);
 }
 
-void KernelContext::SetOutputWithoutSetRange(
-    int index, std::shared_ptr<TensorBase> output) {
-  outputs_.at(index) = std::move(output);
+void KernelContext::SetOutputWithoutSetRange(int index, TensorBase* output) {
+  outputs_.at(index) = output;
 }
 
 void KernelContext::EmplaceBackOutputs(
-    paddle::SmallVector<std::shared_ptr<TensorBase>> outputs) {
+    paddle::SmallVector<TensorBase*> outputs) {
   int index = outputs_.size();
   // Record the start and end index of the input
   output_range_.emplace_back(
@@ -116,19 +113,5 @@ std::pair<int, int>& KernelContext::MutableOutputRangeAt(size_t idx) {
 
 // Temporary method: For compatible with fluid Tensor and improve performance
 // Only deal with DenseTensor now
-void KernelContext::ClearData() {
-  for (auto& in : inputs_) {
-    if (in) {
-      CompatibleDenseTensorUtils::ClearStorage(
-          static_cast<DenseTensor*>(in.get()));
-    }
-  }
-  for (auto& out : outputs_) {
-    if (out) {
-      CompatibleDenseTensorUtils::ClearStorage(
-          static_cast<DenseTensor*>(out.get()));
-    }
-  }
-  attrs_.clear();
-}
+void KernelContext::ClearData() { attrs_.clear(); }
 }  // namespace pten
diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h
index b6cc15c084a..5559b348aa1 100644
--- a/paddle/pten/core/kernel_context.h
+++ b/paddle/pten/core/kernel_context.h
@@ -51,21 +51,19 @@ class KernelContext {
     return static_cast<const CtxType&>(*dev_ctx_);
   }
 
-  void EmplaceBackInput(std::shared_ptr<TensorBase> input);
+  void EmplaceBackInput(const TensorBase* input);
 
-  void EmplaceBackInputWithoutSetRange(std::shared_ptr<TensorBase> input);
+  void EmplaceBackInputWithoutSetRange(const TensorBase* input);
 
-  void EmplaceBackInputs(
-      paddle::SmallVector<std::shared_ptr<TensorBase>> inputs);
+  void EmplaceBackInputs(paddle::SmallVector<const TensorBase*> inputs);
 
-  void EmplaceBackOutput(std::shared_ptr<TensorBase> output);
+  void EmplaceBackOutput(TensorBase* output);
 
-  void EmplaceBackOutputWithoutSetRange(std::shared_ptr<TensorBase> output);
+  void EmplaceBackOutputWithoutSetRange(TensorBase* output);
 
-  void SetOutputWithoutSetRange(int index, std::shared_ptr<TensorBase> output);
+  void EmplaceBackOutputs(paddle::SmallVector<TensorBase*> outputs);
 
-  void EmplaceBackOutputs(
-      paddle::SmallVector<std::shared_ptr<TensorBase>> outputs);
+  void SetOutputWithoutSetRange(int index, TensorBase* output);
 
   void EmplaceBackAttr(paddle::any attr);
 
@@ -90,16 +88,12 @@ class KernelContext {
                  : paddle::optional<const TensorType&>{paddle::none};
   }
 
-  std::shared_ptr<TensorBase>& MutableInputPtrAt(size_t idx) {
-    return inputs_.at(idx);
-  }
-
   template <typename TensorType>
   std::vector<TensorType> MoveInputsBetween(size_t start, size_t end) {
     std::vector<TensorType> v;
     for (size_t i = start; i < end; ++i) {
-      auto t = std::dynamic_pointer_cast<TensorType>(inputs_.at(i));
-      v.emplace_back(std::move(*t.get()));
+      auto t = static_cast<const TensorType*>(inputs_.at(i));
+      v.emplace_back(*t);
       inputs_.at(i) = nullptr;
     }
     return v;
@@ -109,21 +103,16 @@ class KernelContext {
 
   void AssignOutputRange(std::pair<int, int>&& range, size_t idx);
 
-  template <typename TensorType>
-  TensorType* MutableInputAt(size_t idx) {
-    return static_cast<TensorType*>(inputs_.at(idx).get());
-  }
-
   template <typename TensorType>
   TensorType* MutableOutputAt(size_t idx) {
-    return static_cast<TensorType*>(outputs_.at(idx).get());
+    return static_cast<TensorType*>(outputs_.at(idx));
   }
 
   template <typename TensorType>
   std::vector<TensorType*> MutableOutputBetween(size_t start, size_t end) {
     std::vector<TensorType*> v;
     for (size_t i = start; i < end; ++i) {
-      v.emplace_back(static_cast<TensorType*>(outputs_.at(i).get()));
+      v.emplace_back(static_cast<TensorType*>(outputs_.at(i)));
     }
 
     return v;
@@ -153,8 +142,8 @@ class KernelContext {
 
   // TODO(chenweihang): Tensor -> Tensor*, Tensor should by managed `scope`
   // Note: can't use API Tensor here, the inference don't use this API Tensor
-  paddle::SmallVector<std::shared_ptr<TensorBase>> inputs_;
-  paddle::SmallVector<std::shared_ptr<TensorBase>> outputs_;
+  paddle::SmallVector<const TensorBase*> inputs_;
+  paddle::SmallVector<TensorBase*> outputs_;
   paddle::SmallVector<paddle::any> attrs_;
 
   // Only contains input like list[Tensor] need `range`
diff --git a/paddle/pten/kernels/gpu/copy_kernel.cu b/paddle/pten/kernels/gpu/copy_kernel.cu
index 10b2aa415d4..1f7a08e8254 100644
--- a/paddle/pten/kernels/gpu/copy_kernel.cu
+++ b/paddle/pten/kernels/gpu/copy_kernel.cu
@@ -31,7 +31,7 @@ void Copy(const Context& dev_ctx,
           DenseTensor* dst) {
   auto* src_ptr = src.data();
   const auto& src_place = src.place();
-  const auto& dst_place = dst->place();
+  auto dst_place = dst->place();
 
   if (src_place == dst_place && paddle::platform::is_cpu_place(src_place)) {
     PADDLE_THROW(paddle::platform::errors::InvalidArgument(
@@ -51,6 +51,7 @@ void Copy(const Context& dev_ctx,
     return;
   }
   VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
+
   CHECK(dst->layout() == src.layout());
 
   auto size = src.numel() *
@@ -208,6 +209,9 @@ void Copy(const Context& dev_ctx,
             "Context place dose not match the source and destination place."));
       }
     }
+  } else {
+    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+        "Place type error. Please check the place of src and dst Tensor."));
   }
 }
 
diff --git a/paddle/pten/tests/api/scale_api.h b/paddle/pten/tests/api/scale_api.h
index 41143826c45..0ba1d6a0e3f 100644
--- a/paddle/pten/tests/api/scale_api.h
+++ b/paddle/pten/tests/api/scale_api.h
@@ -62,7 +62,7 @@ PADDLE_API Tensor scale_kernel_context(const Tensor& x,
   auto kernel_context = pten::KernelContext(dev_ctx);
 
   auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
-  kernel_context.EmplaceBackInput(dense_x);
+  kernel_context.EmplaceBackInput(dense_x.get());
 
   kernel_context.EmplaceBackAttr(pten::Scalar(scale));
   kernel_context.EmplaceBackAttr(bias);
@@ -73,7 +73,7 @@ PADDLE_API Tensor scale_kernel_context(const Tensor& x,
       pten::make_intrusive<paddle::experimental::SharedStorage>(
           pten::TransToFluidPlace(kernel_backend)),
       std::move(out_meta));
-  kernel_context.EmplaceBackOutput(dense_out);
+  kernel_context.EmplaceBackOutput(dense_out.get());
 
   Tensor out;
   out.set_impl(dense_out);
-- 
GitLab