未验证 提交 8784ec65 编写于 作者: Z zyfncg 提交者: GitHub

【PTen】Remove code of converting Tensor to DenseTensor (#38926)

* remove MakePtenTensor in BuildKernelContext

* fix a bug caused by storage

* remove WriteBackOutput in dynamic and static mode

* fix compile error of std::max

* fix compile error of std::max

* fix data_type bug

* fix memory alloc bug

* add some debug info

* fix compile problem

* fix problem of data_type check

* comment out some unreached code
上级 90e9233a
...@@ -1192,9 +1192,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -1192,9 +1192,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
platform::EventRole::kInnerOp); platform::EventRole::kInnerOp);
if (run_pten_kernel_) { if (run_pten_kernel_) {
pten::KernelContext pt_kernel_context; pten::KernelContext pt_kernel_context;
// Do data transform before building KernelContext
PreparePtenData(exec_scope, *pt_kernel_, *pt_kernel_signature_,
runtime_ctx);
BuildPtenKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context); BuildPtenKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context);
(*pt_kernel_)(&pt_kernel_context); (*pt_kernel_)(&pt_kernel_context);
WriteBackToOutputs(runtime_ctx, &pt_kernel_context);
} else { } else {
(*kernel_func_)( (*kernel_func_)(
ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx)); ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx));
...@@ -1786,6 +1788,62 @@ KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs( ...@@ -1786,6 +1788,62 @@ KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs(
pten::TransToPtenKernelName(Type())); pten::TransToPtenKernelName(Type()));
} }
// Prepare (i.e. potentially device-transfer) the inputs of a pten kernel
// before the KernelContext is built: for every tensor input whose current
// place differs from the place implied by the kernel's input defs, the data
// would be copied into a child scope. The transfer itself is currently
// disabled (see TODO below), so this always returns nullptr for now.
Scope* OperatorWithKernel::PreparePtenData(
    const Scope& scope, const pten::Kernel& pt_kernel,
    const KernelSignature& pt_kernel_signature, RuntimeContext* ctx) const {
  auto& input_names = std::get<0>(pt_kernel_signature.args);
  auto input_defs = pt_kernel.args_def().input_defs();
  // Every kernel argument definition must correspond to one input name.
  PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(),
                    platform::errors::InvalidArgument(
                        "The size of inputs_args names (%d) must be equal to "
                        "the size of kernel input_defs (%d).",
                        input_names.size(), input_defs.size()));

  Scope* new_scope = nullptr;
  for (size_t i = 0; i < input_defs.size(); ++i) {
    auto& in_def = input_defs.at(i);
    auto& ins_vector = ctx->inputs.at(input_names[i]);
    for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
      // Only tensor can be transferred to another device.
      auto* var = ins_vector[offset];
      if (var == nullptr || !VarIsTensor(*var)) {
        continue;
      }

      auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
      if (!tensor_in->IsInitialized()) {
        continue;
      }

      // Nothing to do when the data already lives on the expected place.
      auto expected_place = pten::TransToFluidPlace(in_def.backend);
      if (platform::is_same_place(tensor_in->place(), expected_place)) {
        continue;
      }

      // TODO(zyfncg): Now there is no kernel which need to transform input
      // data, so we commented out following code temporarily,
      // and it will be used in the future.

      // VLOG(3) << "PTen Transform Variable " << input_names[i] << " from "
      //         << tensor_in->place() << " to " << expected_place;

      // if (!new_scope) {
      //   new_scope = &scope.NewScope();
      // }

      // // Create new var with the same name in transfer scopes
      // auto* trans_var = new_scope->Var(input_names[i]);
      // NOTE(review): when re-enabled, this should index by `offset`, not
      // `i` — `i` is the argument index, `offset` the position within this
      // argument's input vector.
      // ins_vector[i] = trans_var;

      // // Do transfer
      // Tensor out;
      // framework::TensorCopySync(*tensor_in, expected_place, &out);
      // SetTensorToVariable(*var, out, trans_var);
    }
  }

  return new_scope;
}
void OperatorWithKernel::BuildPtenKernelContext( void OperatorWithKernel::BuildPtenKernelContext(
const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, const RuntimeContext& ctx, platform::DeviceContext* dev_ctx,
pten::KernelContext* pt_kernel_context) const { pten::KernelContext* pt_kernel_context) const {
...@@ -1818,7 +1876,6 @@ void OperatorWithKernel::BuildPtenKernelContext( ...@@ -1818,7 +1876,6 @@ void OperatorWithKernel::BuildPtenKernelContext(
attr_names.size(), attr_defs.size())); attr_names.size(), attr_defs.size()));
for (size_t i = 0; i < input_names.size(); ++i) { for (size_t i = 0; i < input_names.size(); ++i) {
auto& in_def = input_defs.at(i);
auto& ins_vector = ctx.inputs.at(input_names[i]); auto& ins_vector = ctx.inputs.at(input_names[i]);
// calcute the start and end index of the input tensors // calcute the start and end index of the input tensors
...@@ -1827,14 +1884,22 @@ void OperatorWithKernel::BuildPtenKernelContext( ...@@ -1827,14 +1884,22 @@ void OperatorWithKernel::BuildPtenKernelContext(
size_t end_idx = start_idx + ins_vector.size(); size_t end_idx = start_idx + ins_vector.size();
for (size_t offset = 0; offset < ins_vector.size(); ++offset) { for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
pt_kernel_context->EmplaceBackInputWithoutSetRange( const framework::Tensor* tensor_in = nullptr;
experimental::MakePtenTensorBaseFromVar(*ins_vector[offset], in_def)); auto* var = ins_vector[offset];
if (var->IsType<framework::LoDTensor>()) {
tensor_in = &(var->Get<framework::LoDTensor>());
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported input `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
} // TODO(zyfncg): Add support for SelectedRows
pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
} }
pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i); pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
} }
for (size_t i = 0; i < output_names.size(); ++i) { for (size_t i = 0; i < output_names.size(); ++i) {
auto& out_def = output_defs.at(i);
auto& outs_vector = ctx.outputs.at(output_names[i]); auto& outs_vector = ctx.outputs.at(output_names[i]);
size_t start_idx = size_t start_idx =
...@@ -1842,9 +1907,21 @@ void OperatorWithKernel::BuildPtenKernelContext( ...@@ -1842,9 +1907,21 @@ void OperatorWithKernel::BuildPtenKernelContext(
size_t end_idx = start_idx + outs_vector.size(); size_t end_idx = start_idx + outs_vector.size();
for (size_t offset = 0; offset < outs_vector.size(); ++offset) { for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
pt_kernel_context->EmplaceBackOutputWithoutSetRange( framework::Tensor* tensor_out = nullptr;
experimental::MakePtenTensorBaseFromVar(outs_vector[offset], auto* var = outs_vector[offset];
out_def)); if (var->template IsType<framework::LoDTensor>()) {
tensor_out = var->template GetMutable<framework::LoDTensor>();
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported output `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
} // TODO(zyfncg): Add support for SelectedRows
experimental::ResetTensorByArgDef(tensor_out, output_defs.at(i));
SetAllocationForOutputTenosr(
tensor_out, pten::TransToFluidPlace(output_defs.at(i).backend));
pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
} }
// Deal with the case that some outputs are NULL when run the kernel. // Deal with the case that some outputs are NULL when run the kernel.
......
...@@ -588,6 +588,14 @@ class OperatorWithKernel : public OperatorBase { ...@@ -588,6 +588,14 @@ class OperatorWithKernel : public OperatorBase {
/* member functions for adapting to pten lib */ /* member functions for adapting to pten lib */
void ChoosePtenKernel(const ExecutionContext& ctx) const; void ChoosePtenKernel(const ExecutionContext& ctx) const;
/**
* Transfer data place for pten kernel
* Is this really needed?
*/
Scope* PreparePtenData(const Scope& scope, const pten::Kernel& pt_kernel,
const KernelSignature& pt_kernel_signature,
RuntimeContext* ctx) const;
void BuildPtenKernelContext(const RuntimeContext& ctx, void BuildPtenKernelContext(const RuntimeContext& ctx,
platform::DeviceContext* dev_ctx, platform::DeviceContext* dev_ctx,
pten::KernelContext* pt_kernel_context) const; pten::KernelContext* pt_kernel_context) const;
......
...@@ -137,17 +137,17 @@ KernelArgsNameMakerByOpProto::GetInputArgsNames() { ...@@ -137,17 +137,17 @@ KernelArgsNameMakerByOpProto::GetInputArgsNames() {
auto& in = op_proto_->inputs()[i]; auto& in = op_proto_->inputs()[i];
auto& in_name = in.name(); auto& in_name = in.name();
if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) { if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) {
VLOG(3) << "Parse PtenKernel input: skip extra & quant input - " VLOG(6) << "Parse PtenKernel input: skip extra & quant input - "
<< in_name; << in_name;
continue; continue;
} }
// If contains dispensable input, we should override the // If contains dispensable input, we should override the
// GetExpectedPtenKernelArgs method self // GetExpectedPtenKernelArgs method self
if (in.has_dispensable() && in.dispensable()) { if (in.has_dispensable() && in.dispensable()) {
VLOG(3) << "Parse PtenKernel input: skip dispensable input - " << in_name; VLOG(6) << "Parse PtenKernel input: skip dispensable input - " << in_name;
continue; continue;
} }
VLOG(3) << "Parse PtenKernel input: " << in_name; VLOG(6) << "Parse PtenKernel input: " << in_name;
input_names_.emplace_back(in_name); input_names_.emplace_back(in_name);
} }
return input_names_; return input_names_;
...@@ -159,7 +159,7 @@ KernelArgsNameMakerByOpProto::GetOutputArgsNames() { ...@@ -159,7 +159,7 @@ KernelArgsNameMakerByOpProto::GetOutputArgsNames() {
auto& out = op_proto_->outputs()[i]; auto& out = op_proto_->outputs()[i];
auto& out_name = out.name(); auto& out_name = out.name();
// TODO(chenweihang): outputs also need skip some cases // TODO(chenweihang): outputs also need skip some cases
VLOG(3) << "Parse PtenKernel output: " << out_name; VLOG(6) << "Parse PtenKernel output: " << out_name;
output_names_.emplace_back(out_name); output_names_.emplace_back(out_name);
} }
return output_names_; return output_names_;
...@@ -173,17 +173,17 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { ...@@ -173,17 +173,17 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() {
if (attr_name == "use_mkldnn" || attr_name == "op_role" || if (attr_name == "use_mkldnn" || attr_name == "op_role" ||
attr_name == "op_role_var" || attr_name == "op_namescope" || attr_name == "op_role_var" || attr_name == "op_namescope" ||
attr_name == "op_callstack" || attr_name == "op_device") { attr_name == "op_callstack" || attr_name == "op_device") {
VLOG(3) << "Parse PtenKernel attribute: skip needless attr - " VLOG(6) << "Parse PtenKernel attribute: skip needless attr - "
<< attr_name; << attr_name;
continue; continue;
} }
if ((attr.has_extra() && attr.extra()) || if ((attr.has_extra() && attr.extra()) ||
(attr.has_quant() && attr.quant())) { (attr.has_quant() && attr.quant())) {
VLOG(3) << "Parse PtenKernel attribute: skip extra & quant attr - " VLOG(6) << "Parse PtenKernel attribute: skip extra & quant attr - "
<< attr_name; << attr_name;
continue; continue;
} }
VLOG(3) << "Parse PtenKernel attribute: " << attr_name; VLOG(6) << "Parse PtenKernel attribute: " << attr_name;
attr_names_.emplace_back(attr_name); attr_names_.emplace_back(attr_name);
} }
...@@ -196,5 +196,23 @@ KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() { ...@@ -196,5 +196,23 @@ KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() {
GetOutputArgsNames()); GetOutputArgsNames());
} }
// Ensure `tensor` owns an allocation residing on `place` that matches its
// current dims/dtype. When the tensor is already initialized on `place`,
// the existing holder is kept; otherwise a fresh buffer is allocated on
// `place` and installed as the tensor's holder.
void SetAllocationForOutputTenosr(pten::DenseTensor* tensor,
                                  const platform::Place& place) {
  // Fast path: already backed by memory on the requested place.
  if (tensor->IsInitialized() && tensor->place() == place) {
    return;
  }

  // An UNDEFINED dtype contributes zero bytes per element.
  int dtype_size = tensor->dtype() == DataType::UNDEFINED
                       ? 0
                       : experimental::SizeOf(tensor->dtype());
  // A negative element count (presumably from placeholder dims — confirm)
  // is clamped to zero so the allocation size stays non-negative.
  int64_t numel = product(tensor->dims());
  if (numel < 0) {
    numel = 0;
  }

  // Move ownership of the new buffer into a shared_ptr that carries the
  // allocator's own deleter, then hand it to the tensor.
  auto raw_allocation = memory::Alloc(place, numel * dtype_size);
  auto& alloc_deleter = raw_allocation.get_deleter();
  std::shared_ptr<pten::Allocation> holder(raw_allocation.release(),
                                           alloc_deleter);
  tensor->ResetHolder(holder);
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -72,5 +72,8 @@ class KernelArgsNameMaker { ...@@ -72,5 +72,8 @@ class KernelArgsNameMaker {
virtual const paddle::SmallVector<std::string>& GetAttrsArgsNames() = 0; virtual const paddle::SmallVector<std::string>& GetAttrsArgsNames() = 0;
}; };
void SetAllocationForOutputTenosr(pten::DenseTensor* tensor,
const platform::Place& place);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -258,6 +258,49 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins, ...@@ -258,6 +258,49 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
default_attrs); default_attrs);
} }
// Dygraph counterpart of OperatorWithKernel::PreparePtenData: for every
// initialized tensor input whose place differs from the place implied by the
// kernel's input defs, the data would be copied to the expected place.
// The transfer itself is currently disabled (see TODO below), so this is
// effectively a no-op for now.
template <typename VarType>
void PreparePtenData(const pten::Kernel& pt_kernel,
                     const framework::KernelSignature& pt_kernel_signature,
                     const NameVarMap<VarType>& ins) {
  auto& input_names = std::get<0>(pt_kernel_signature.args);
  auto& input_defs = pt_kernel.args_def().input_defs();
  // Every kernel argument definition must correspond to one input name.
  PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(),
                    platform::errors::InvalidArgument(
                        "the size of inputs_args names (%d) must be equal to "
                        "the size of kernel input_defs (%d).",
                        input_names.size(), input_defs.size()));

  for (size_t i = 0; i < input_names.size(); ++i) {
    auto& in_def = input_defs.at(i);
    auto& ins_vector = ins.at(input_names[i]);
    for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
      auto var_base = ins_vector[offset];
      const auto* tensor_in = GetTensorFromVar(var_base->Var());
      // Variables without an initialized tensor need no placement work.
      if (tensor_in && tensor_in->IsInitialized()) {
        auto expected_place = pten::TransToFluidPlace(in_def.backend);
        // Nothing to do when the data already lives on the expected place.
        if (platform::is_same_place(tensor_in->place(), expected_place)) {
          continue;
        }

        // TODO(zyfncg): Now there is no kernel which need to transform input
        // data, so we commented out following code temporarily,
        // and it will be used in the future.

        // VLOG(3) << "Pten Transform Variable " << var_base->Name() << " from "
        //         << tensor_in->place() << " to " << expected_place;

        // framework::Tensor tmp_tensor;
        // framework::TensorCopySync(*tensor_in, expected_place, &tmp_tensor);

        // SetTensorToVariable(var_base->Var(), tmp_tensor,
        //                     var_base->MutableVar());
      }
    }
  }
}
template <typename VarType> template <typename VarType>
static void BuildDygraphPtenKernelContext( static void BuildDygraphPtenKernelContext(
const framework::KernelSignature& pt_kernel_signature, const framework::KernelSignature& pt_kernel_signature,
...@@ -294,23 +337,19 @@ static void BuildDygraphPtenKernelContext( ...@@ -294,23 +337,19 @@ static void BuildDygraphPtenKernelContext(
attr_names.size(), attr_defs.size())); attr_names.size(), attr_defs.size()));
for (size_t i = 0; i < input_names.size(); ++i) { for (size_t i = 0; i < input_names.size(); ++i) {
auto& in_def = input_defs.at(i);
auto& ins_vector = ins.at(input_names[i]); auto& ins_vector = ins.at(input_names[i]);
size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second); size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second);
size_t end_idx = start_idx + ins_vector.size(); size_t end_idx = start_idx + ins_vector.size();
for (size_t offset = 0; offset < ins_vector.size(); ++offset) { for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
const auto& variable = ins_vector[offset]->Var(); const auto* tensor_in = GetTensorFromVar(ins_vector[offset]->Var());
kernel_ctx->EmplaceBackInputWithoutSetRange( kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in);
paddle::experimental::MakePtenTensorBaseFromVar(variable, in_def));
} }
kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
} }
for (size_t i = 0; i < output_names.size(); ++i) { for (size_t i = 0; i < output_names.size(); ++i) {
auto& out_def = output_defs.at(i);
size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second); size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second);
auto iter = outs.find(output_names[i]); auto iter = outs.find(output_names[i]);
...@@ -325,9 +364,21 @@ static void BuildDygraphPtenKernelContext( ...@@ -325,9 +364,21 @@ static void BuildDygraphPtenKernelContext(
size_t end_idx = start_idx + outs_vector.size(); size_t end_idx = start_idx + outs_vector.size();
for (size_t offset = 0; offset < outs_vector.size(); ++offset) { for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
kernel_ctx->EmplaceBackOutputWithoutSetRange( auto* var = outs_vector[offset]->MutableVar();
paddle::experimental::MakePtenTensorBaseFromVar( framework::Tensor* tensor_out = nullptr;
outs_vector[offset]->MutableVar(), out_def)); if (var->template IsType<framework::LoDTensor>()) {
tensor_out = var->template GetMutable<framework::LoDTensor>();
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported output `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
} // TODO(zyfncg): Add support for SelectedRows
experimental::ResetTensorByArgDef(tensor_out, output_defs.at(i));
framework::SetAllocationForOutputTenosr(
tensor_out, pten::TransToFluidPlace(output_defs.at(i).backend));
kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out);
} }
kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
} }
...@@ -430,29 +481,6 @@ static void BuildDygraphPtenKernelContext( ...@@ -430,29 +481,6 @@ static void BuildDygraphPtenKernelContext(
} }
} }
// Copy the pten kernel's output DenseTensors back into the corresponding
// fluid output Variables after the kernel has run (dygraph mode).
template <typename VarType>
static void WriteBackToOutputs(
    const framework::KernelSignature& pt_kernel_signature,
    const NameVarMap<VarType>& outs, pten::KernelContext* kernel_ctx) {
  auto& output_names = std::get<2>(pt_kernel_signature.args);

  for (size_t i = 0; i < output_names.size(); ++i) {
    auto iter = outs.find(output_names[i]);
    // Output names missing from `outs` are simply skipped.
    if (iter != outs.end()) {
      auto& outs_vector = iter->second;

      // [first, second) is the slot range this output argument occupies in
      // the kernel context.
      auto& range_pair = kernel_ctx->OutputRangeAt(i);
      auto pten_outs = kernel_ctx->MutableOutputBetween<pten::DenseTensor>(
          range_pair.first, range_pair.second);

      for (size_t j = 0; j < pten_outs.size(); ++j) {
        experimental::MakeVariableFromPtenTensor(pten_outs[j],
                                                 outs_vector[j]->MutableVar());
      }
    }
  }
}
template <typename VarType> template <typename VarType>
static void PreparedOpRunImpl( static void PreparedOpRunImpl(
const framework::OperatorBase& op, const framework::RuntimeContext& ctx, const framework::OperatorBase& op, const framework::RuntimeContext& ctx,
...@@ -514,6 +542,8 @@ static void PreparedOpRunPtImpl( ...@@ -514,6 +542,8 @@ static void PreparedOpRunPtImpl(
&ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type);
op.Info().infer_shape_(&infer_shape_ctx); op.Info().infer_shape_(&infer_shape_ctx);
PreparePtenData<VarType>(pt_kernel, pt_kernel_signature, ins);
pten::KernelContext pt_kernel_context; pten::KernelContext pt_kernel_context;
BuildDygraphPtenKernelContext<VarType>(pt_kernel_signature, pt_kernel, ins, BuildDygraphPtenKernelContext<VarType>(pt_kernel_signature, pt_kernel, ins,
outs, attrs, default_attrs, dev_ctx, outs, attrs, default_attrs, dev_ctx,
...@@ -529,8 +559,6 @@ static void PreparedOpRunPtImpl( ...@@ -529,8 +559,6 @@ static void PreparedOpRunPtImpl(
#endif #endif
} }
WriteBackToOutputs<VarType>(pt_kernel_signature, outs, &pt_kernel_context);
// TODO(chenweihang): add debug flags later // TODO(chenweihang): add debug flags later
if (framework::IsComplexType(kernel_type.data_type_)) { if (framework::IsComplexType(kernel_type.data_type_)) {
HandleComplexGradToRealGrad<VarType>(outs); HandleComplexGradToRealGrad<VarType>(outs);
......
...@@ -54,7 +54,7 @@ PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking) { ...@@ -54,7 +54,7 @@ PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking) {
// 3. Auto data transform // 3. Auto data transform
auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl()); auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
kernel_context.EmplaceBackInput(dense_x); kernel_context.EmplaceBackInput(dense_x.get());
kernel_context.EmplaceBackAttr(blocking); kernel_context.EmplaceBackAttr(blocking);
// 4. InferMeta // 4. InferMeta
...@@ -65,7 +65,7 @@ PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking) { ...@@ -65,7 +65,7 @@ PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking) {
pten::make_intrusive<paddle::experimental::SharedStorage>( pten::make_intrusive<paddle::experimental::SharedStorage>(
pten::TransToFluidPlace(backend)), pten::TransToFluidPlace(backend)),
std::move(out_meta)); std::move(out_meta));
kernel_context.EmplaceBackOutput(dense_out); kernel_context.EmplaceBackOutput(dense_out.get());
Tensor out; Tensor out;
out.set_impl(dense_out); out.set_impl(dense_out);
......
...@@ -355,98 +355,6 @@ void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, ...@@ -355,98 +355,6 @@ void ReMakePtenDenseTensor(const paddle::framework::Tensor& src,
dst); dst);
} }
// Rebuild `dst`'s meta from `src` (dims/layout/offset), taking the dtype
// from the kernel argument definition. The underlying allocation is shared
// with `src` only when `src` is initialized on the place the arg def
// expects; otherwise the holder is left as-is (see comment below).
void ReMakePtenDenseTensorByArgDefBase(const paddle::framework::Tensor& src,
                                       const pten::TensorArgDef& arg_def,
                                       pten::DenseTensor* dst) {
  VLOG(3) << "ReMakePtenDenseTensor based Tensor and TensorArgDef.";
  auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst);
  meta->dims = src.dims();
  // dtype comes from the kernel definition, not from `src`.
  meta->dtype = arg_def.dtype;
  meta->layout = src.layout();
  meta->offset = src.offset();

  if (src.IsInitialized() &&
      src.place() == pten::TransToFluidPlace(arg_def.backend)) {
    dst->ResetHolder(src.Holder());
  } else {
    // This does not affect the correctness, and will be modified immediately.
    // dst->mutable_data(pten::TransToFluidPlace(arg_def.backend));
  }
}
// Like ReMakePtenDenseTensorByArgDefBase, but additionally carries over the
// LoD information from `src` into `dst`'s meta.
void ReMakePtenDenseTensorByArgDef(const paddle::framework::Tensor& src,
                                   const pten::TensorArgDef& arg_def,
                                   pten::DenseTensor* dst) {
  auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst);
  SetLoD(&meta->lod, src.lod());
  // `src` is already a framework::Tensor, so the previous
  // static_cast<const paddle::framework::Tensor&> was a no-op and is dropped.
  ReMakePtenDenseTensorByArgDefBase(src, arg_def, dst);
}
// Rebuild `dst` as a pten DenseTensor from the tensor held by `variable`
// (either a LoDTensor or the value of a SelectedRows). If the data does not
// live on the place implied by `arg_def.backend`, it is first copied there
// synchronously. Throws Unimplemented for any other variable type, and
// InvalidArgument when the variable's dtype does not match `arg_def`.
void ReMakePtenDenseTensorFromVar(const framework::Variable& variable,
                                  const pten::TensorArgDef& arg_def,
                                  pten::DenseTensor* dst) {
  auto expected_place = pten::TransToFluidPlace(arg_def.backend);

  if (variable.IsType<framework::LoDTensor>()) {
    const auto& tensor = variable.Get<framework::LoDTensor>();
    // check input dtype before ReMakePtenDenseTensor
    // (error message typo fixed: "diffrent" -> "different")
    PADDLE_ENFORCE(
        (arg_def.dtype == pten::TransToPtenDataType(tensor.type())),
        paddle::platform::errors::InvalidArgument(
            "The type of input data is different from the type of the "
            "argument's definition in kernel."));
    if (!platform::is_same_place(tensor.place(), expected_place)) {
      framework::LoDTensor tmp_tensor;
      framework::TensorCopySync(tensor, expected_place, &tmp_tensor);
      ReMakePtenDenseTensorByArgDef(tmp_tensor, arg_def, dst);
    } else {
      ReMakePtenDenseTensorByArgDef(tensor, arg_def, dst);
    }
  } else if (variable.IsType<framework::SelectedRows>()) {
    // TODO(chenweihang): now we don't deal with row and height
    // by xiaowei's advice
    const auto& tensor = variable.Get<framework::SelectedRows>();
    PADDLE_ENFORCE(
        (arg_def.dtype == pten::TransToPtenDataType(tensor.value().type())),
        paddle::platform::errors::InvalidArgument(
            "The type of input data is different from the type of the "
            "argument's definition in kernel."));
    if (!platform::is_same_place(tensor.value().place(), expected_place)) {
      framework::Tensor tmp_tensor;
      paddle::framework::TensorCopySync(
          tensor.value(), expected_place, &tmp_tensor);
      // TODO(chenweihang): adapt SelectedRows by xiaowei's design
      ReMakePtenDenseTensorByArgDef(tmp_tensor, arg_def, dst);
    } else {
      ReMakePtenDenseTensorByArgDef(tensor.value(), arg_def, dst);
    }
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Unsupported shared input `%s` type now when call pt kernel.",
        framework::ToTypeName(variable.Type())));
  }
}
// Rebuild `dst` as a pten DenseTensor over the mutable tensor held by
// `variable` (LoDTensor or SelectedRows value). Throws Unimplemented for
// any other variable type.
void ReMakePtenDenseTensorFromVar(framework::Variable* variable,
                                  const pten::TensorArgDef& arg_def,
                                  pten::DenseTensor* dst) {
  // Called before running the kernel, to avoid sharing the output from the
  // KernelContext with the original tensor.
  if (variable->template IsType<framework::LoDTensor>()) {
    auto* tensor = variable->template GetMutable<framework::LoDTensor>();
    ReMakePtenDenseTensorByArgDef(*tensor, arg_def, dst);
  } else if (variable->template IsType<framework::SelectedRows>()) {
    auto* tensor = variable->template GetMutable<framework::SelectedRows>();
    // TODO(chenweihang): adapt SelectedRows by xiaowei's design,
    // here the row and height will lost in output!
    ReMakePtenDenseTensorByArgDef(tensor->value(), arg_def, dst);
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Unsupported shared output `%s` type now when call pt kernel.",
        framework::ToTypeName(variable->Type())));
  }
}
static bool IsSameAllocation(const std::shared_ptr<memory::Allocation>& a, static bool IsSameAllocation(const std::shared_ptr<memory::Allocation>& a,
const std::shared_ptr<memory::Allocation>& b) { const std::shared_ptr<memory::Allocation>& b) {
return a->ptr() == b->ptr() && a->size() == b->size() && return a->ptr() == b->ptr() && a->size() == b->size() &&
...@@ -489,5 +397,13 @@ void MakeVariableFromPtenTensor(pten::DenseTensor* src, ...@@ -489,5 +397,13 @@ void MakeVariableFromPtenTensor(pten::DenseTensor* src,
} }
} }
// Reset only the dtype and layout of `dst`'s meta to match the kernel
// argument definition; dims, lod and the data holder are left untouched.
void ResetTensorByArgDef(pten::DenseTensor* dst,
                         const pten::TensorArgDef& arg_def) {
  VLOG(5) << "ResetTensor by TensorArgDef.";
  auto* tensor_meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst);
  tensor_meta->layout = arg_def.layout;
  tensor_meta->dtype = arg_def.dtype;
}
} // namespace experimental } // namespace experimental
} // namespace paddle } // namespace paddle
...@@ -67,20 +67,11 @@ void SharesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst); ...@@ -67,20 +67,11 @@ void SharesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst);
void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, void ReMakePtenDenseTensor(const paddle::framework::Tensor& src,
pten::DenseTensor* dst); pten::DenseTensor* dst);
void ReMakePtenDenseTensorByArgDef(const paddle::framework::Tensor& src,
const pten::TensorArgDef& arg_def,
pten::DenseTensor* dst);
void ReMakePtenDenseTensorFromVar(const framework::Variable& variable,
const pten::TensorArgDef& arg_def,
pten::DenseTensor* dst);
void ReMakePtenDenseTensorFromVar(framework::Variable* variable,
const pten::TensorArgDef& arg_def,
pten::DenseTensor* dst);
void MakeVariableFromPtenTensor(pten::DenseTensor* src, void MakeVariableFromPtenTensor(pten::DenseTensor* src,
framework::Variable* variable); framework::Variable* variable);
void ResetTensorByArgDef(pten::DenseTensor* dst,
const pten::TensorArgDef& arg_def);
} // namespace experimental } // namespace experimental
} // namespace paddle } // namespace paddle
...@@ -73,6 +73,7 @@ inline size_t SizeOf(DataType data_type) { ...@@ -73,6 +73,7 @@ inline size_t SizeOf(DataType data_type) {
case DataType::COMPLEX128: case DataType::COMPLEX128:
return 16; return 16;
case DataType::UNDEFINED: case DataType::UNDEFINED:
return 0;
case DataType::NUM_DATA_TYPES: case DataType::NUM_DATA_TYPES:
PD_THROW("Data type `", PD_THROW("Data type `",
static_cast<int>(data_type), static_cast<int>(data_type),
......
...@@ -113,7 +113,8 @@ void* DenseTensor::mutable_data(size_t request_bytes) { ...@@ -113,7 +113,8 @@ void* DenseTensor::mutable_data(size_t request_bytes) {
bytes)); bytes));
bytes = request_bytes; bytes = request_bytes;
} }
if (storage_->size() < bytes + meta_.offset || storage_->size() == 0) { if (!storage_->data() || storage_->size() < bytes + meta_.offset ||
storage_->size() == 0) {
VLOG(10) << "mutbale data realloc, original size: " << storage_->size() VLOG(10) << "mutbale data realloc, original size: " << storage_->size()
<< ", new size: " << bytes; << ", new size: " << bytes;
storage_->Realloc(bytes); storage_->Realloc(bytes);
......
...@@ -16,20 +16,19 @@ ...@@ -16,20 +16,19 @@
namespace pten { namespace pten {
void KernelContext::EmplaceBackInput(std::shared_ptr<TensorBase> input) { void KernelContext::EmplaceBackInput(const TensorBase* input) {
int index = inputs_.size(); int index = inputs_.size();
inputs_.emplace_back(std::move(input)); inputs_.emplace_back(input);
// Record the start and end index of the input // Record the start and end index of the input
input_range_.emplace_back(std::pair<int, int>(index, index + 1)); input_range_.emplace_back(std::pair<int, int>(index, index + 1));
} }
void KernelContext::EmplaceBackInputWithoutSetRange( void KernelContext::EmplaceBackInputWithoutSetRange(const TensorBase* input) {
std::shared_ptr<TensorBase> input) { inputs_.emplace_back(input);
inputs_.emplace_back(std::move(input));
} }
void KernelContext::EmplaceBackInputs( void KernelContext::EmplaceBackInputs(
paddle::SmallVector<std::shared_ptr<TensorBase>> inputs) { paddle::SmallVector<const TensorBase*> inputs) {
int index = inputs_.size(); int index = inputs_.size();
// Record the start and end index of the input // Record the start and end index of the input
input_range_.emplace_back(std::pair<int, int>(index, index + inputs.size())); input_range_.emplace_back(std::pair<int, int>(index, index + inputs.size()));
...@@ -38,25 +37,23 @@ void KernelContext::EmplaceBackInputs( ...@@ -38,25 +37,23 @@ void KernelContext::EmplaceBackInputs(
std::make_move_iterator(inputs.end())); std::make_move_iterator(inputs.end()));
} }
void KernelContext::EmplaceBackOutput(std::shared_ptr<TensorBase> output) { void KernelContext::EmplaceBackOutput(TensorBase* output) {
int index = outputs_.size(); int index = outputs_.size();
outputs_.emplace_back(std::move(output)); outputs_.emplace_back(output);
// Record the start and end index of the input // Record the start and end index of the input
output_range_.emplace_back(std::pair<int, int>(index, index + 1)); output_range_.emplace_back(std::pair<int, int>(index, index + 1));
} }
void KernelContext::EmplaceBackOutputWithoutSetRange( void KernelContext::EmplaceBackOutputWithoutSetRange(TensorBase* output) {
std::shared_ptr<TensorBase> output) { outputs_.emplace_back(output);
outputs_.emplace_back(std::move(output));
} }
void KernelContext::SetOutputWithoutSetRange( void KernelContext::SetOutputWithoutSetRange(int index, TensorBase* output) {
int index, std::shared_ptr<TensorBase> output) { outputs_.at(index) = output;
outputs_.at(index) = std::move(output);
} }
void KernelContext::EmplaceBackOutputs( void KernelContext::EmplaceBackOutputs(
paddle::SmallVector<std::shared_ptr<TensorBase>> outputs) { paddle::SmallVector<TensorBase*> outputs) {
int index = outputs_.size(); int index = outputs_.size();
// Record the start and end index of the input // Record the start and end index of the input
output_range_.emplace_back( output_range_.emplace_back(
...@@ -116,19 +113,5 @@ std::pair<int, int>& KernelContext::MutableOutputRangeAt(size_t idx) { ...@@ -116,19 +113,5 @@ std::pair<int, int>& KernelContext::MutableOutputRangeAt(size_t idx) {
// Temporary method: For compatible with fluid Tensor and improve performance // Temporary method: For compatible with fluid Tensor and improve performance
// Only deal with DenseTensor now // Only deal with DenseTensor now
void KernelContext::ClearData() { void KernelContext::ClearData() { attrs_.clear(); }
for (auto& in : inputs_) {
if (in) {
CompatibleDenseTensorUtils::ClearStorage(
static_cast<DenseTensor*>(in.get()));
}
}
for (auto& out : outputs_) {
if (out) {
CompatibleDenseTensorUtils::ClearStorage(
static_cast<DenseTensor*>(out.get()));
}
}
attrs_.clear();
}
} // namespace pten } // namespace pten
...@@ -51,21 +51,19 @@ class KernelContext { ...@@ -51,21 +51,19 @@ class KernelContext {
return static_cast<const CtxType&>(*dev_ctx_); return static_cast<const CtxType&>(*dev_ctx_);
} }
void EmplaceBackInput(std::shared_ptr<TensorBase> input); void EmplaceBackInput(const TensorBase* input);
void EmplaceBackInputWithoutSetRange(std::shared_ptr<TensorBase> input); void EmplaceBackInputWithoutSetRange(const TensorBase* input);
void EmplaceBackInputs( void EmplaceBackInputs(paddle::SmallVector<const TensorBase*> inputs);
paddle::SmallVector<std::shared_ptr<TensorBase>> inputs);
void EmplaceBackOutput(std::shared_ptr<TensorBase> output); void EmplaceBackOutput(TensorBase* output);
void EmplaceBackOutputWithoutSetRange(std::shared_ptr<TensorBase> output); void EmplaceBackOutputWithoutSetRange(TensorBase* output);
void SetOutputWithoutSetRange(int index, std::shared_ptr<TensorBase> output); void EmplaceBackOutputs(paddle::SmallVector<TensorBase*> outputs);
void EmplaceBackOutputs( void SetOutputWithoutSetRange(int index, TensorBase* output);
paddle::SmallVector<std::shared_ptr<TensorBase>> outputs);
void EmplaceBackAttr(paddle::any attr); void EmplaceBackAttr(paddle::any attr);
...@@ -90,16 +88,12 @@ class KernelContext { ...@@ -90,16 +88,12 @@ class KernelContext {
: paddle::optional<const TensorType&>{paddle::none}; : paddle::optional<const TensorType&>{paddle::none};
} }
std::shared_ptr<TensorBase>& MutableInputPtrAt(size_t idx) {
return inputs_.at(idx);
}
template <typename TensorType> template <typename TensorType>
std::vector<TensorType> MoveInputsBetween(size_t start, size_t end) { std::vector<TensorType> MoveInputsBetween(size_t start, size_t end) {
std::vector<TensorType> v; std::vector<TensorType> v;
for (size_t i = start; i < end; ++i) { for (size_t i = start; i < end; ++i) {
auto t = std::dynamic_pointer_cast<TensorType>(inputs_.at(i)); auto t = std::dynamic_pointer_cast<TensorType>(inputs_.at(i));
v.emplace_back(std::move(*t.get())); v.emplace_back(*t);
inputs_.at(i) = nullptr; inputs_.at(i) = nullptr;
} }
return v; return v;
...@@ -109,21 +103,16 @@ class KernelContext { ...@@ -109,21 +103,16 @@ class KernelContext {
void AssignOutputRange(std::pair<int, int>&& range, size_t idx); void AssignOutputRange(std::pair<int, int>&& range, size_t idx);
template <typename TensorType>
TensorType* MutableInputAt(size_t idx) {
return static_cast<TensorType*>(inputs_.at(idx).get());
}
template <typename TensorType> template <typename TensorType>
TensorType* MutableOutputAt(size_t idx) { TensorType* MutableOutputAt(size_t idx) {
return static_cast<TensorType*>(outputs_.at(idx).get()); return static_cast<TensorType*>(outputs_.at(idx));
} }
template <typename TensorType> template <typename TensorType>
std::vector<TensorType*> MutableOutputBetween(size_t start, size_t end) { std::vector<TensorType*> MutableOutputBetween(size_t start, size_t end) {
std::vector<TensorType*> v; std::vector<TensorType*> v;
for (size_t i = start; i < end; ++i) { for (size_t i = start; i < end; ++i) {
v.emplace_back(static_cast<TensorType*>(outputs_.at(i).get())); v.emplace_back(static_cast<TensorType*>(outputs_.at(i)));
} }
return v; return v;
...@@ -153,8 +142,8 @@ class KernelContext { ...@@ -153,8 +142,8 @@ class KernelContext {
// TODO(chenweihang): Tensor -> Tensor*, Tensor should by managed `scope` // TODO(chenweihang): Tensor -> Tensor*, Tensor should by managed `scope`
// Note: can't use API Tensor here, the inference don't use this API Tensor // Note: can't use API Tensor here, the inference don't use this API Tensor
paddle::SmallVector<std::shared_ptr<TensorBase>> inputs_; paddle::SmallVector<const TensorBase*> inputs_;
paddle::SmallVector<std::shared_ptr<TensorBase>> outputs_; paddle::SmallVector<TensorBase*> outputs_;
paddle::SmallVector<paddle::any> attrs_; paddle::SmallVector<paddle::any> attrs_;
// Only contains input like list[Tensor] need `range` // Only contains input like list[Tensor] need `range`
......
...@@ -31,7 +31,7 @@ void Copy(const Context& dev_ctx, ...@@ -31,7 +31,7 @@ void Copy(const Context& dev_ctx,
DenseTensor* dst) { DenseTensor* dst) {
auto* src_ptr = src.data(); auto* src_ptr = src.data();
const auto& src_place = src.place(); const auto& src_place = src.place();
const auto& dst_place = dst->place(); auto dst_place = dst->place();
if (src_place == dst_place && paddle::platform::is_cpu_place(src_place)) { if (src_place == dst_place && paddle::platform::is_cpu_place(src_place)) {
PADDLE_THROW(paddle::platform::errors::InvalidArgument( PADDLE_THROW(paddle::platform::errors::InvalidArgument(
...@@ -51,6 +51,7 @@ void Copy(const Context& dev_ctx, ...@@ -51,6 +51,7 @@ void Copy(const Context& dev_ctx,
return; return;
} }
VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr; VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
CHECK(dst->layout() == src.layout()); CHECK(dst->layout() == src.layout());
auto size = src.numel() * auto size = src.numel() *
...@@ -208,6 +209,9 @@ void Copy(const Context& dev_ctx, ...@@ -208,6 +209,9 @@ void Copy(const Context& dev_ctx,
"Context place dose not match the source and destination place.")); "Context place dose not match the source and destination place."));
} }
} }
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Place type error. Please check the place of src and dst Tensor."));
} }
} }
......
...@@ -62,7 +62,7 @@ PADDLE_API Tensor scale_kernel_context(const Tensor& x, ...@@ -62,7 +62,7 @@ PADDLE_API Tensor scale_kernel_context(const Tensor& x,
auto kernel_context = pten::KernelContext(dev_ctx); auto kernel_context = pten::KernelContext(dev_ctx);
auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl()); auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
kernel_context.EmplaceBackInput(dense_x); kernel_context.EmplaceBackInput(dense_x.get());
kernel_context.EmplaceBackAttr(pten::Scalar(scale)); kernel_context.EmplaceBackAttr(pten::Scalar(scale));
kernel_context.EmplaceBackAttr(bias); kernel_context.EmplaceBackAttr(bias);
...@@ -73,7 +73,7 @@ PADDLE_API Tensor scale_kernel_context(const Tensor& x, ...@@ -73,7 +73,7 @@ PADDLE_API Tensor scale_kernel_context(const Tensor& x,
pten::make_intrusive<paddle::experimental::SharedStorage>( pten::make_intrusive<paddle::experimental::SharedStorage>(
pten::TransToFluidPlace(kernel_backend)), pten::TransToFluidPlace(kernel_backend)),
std::move(out_meta)); std::move(out_meta));
kernel_context.EmplaceBackOutput(dense_out); kernel_context.EmplaceBackOutput(dense_out.get());
Tensor out; Tensor out;
out.set_impl(dense_out); out.set_impl(dense_out);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册