diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
index 93cd0d1338f684a2d18803f0038c9fb8f53d7dc9..d71d78b5d9d0c13955c53ed5f1b7a8b73052cf4b 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc
@@ -175,7 +175,7 @@ TEST(Benchmark, EagerIntermediateMLPCPU) {
   }
 }
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_OP(elementwise_add);
 USE_OP(matmul_v2);
 USE_OP(reduce_sum);
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
index 2df44bfcab57fcda293841af8e3a89fa6290499d..640ee0152efc4fa74ba59dd2e8803e26bdb91fa5 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc
@@ -185,7 +185,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) {
   }
 }
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_OP(matmul_v2);
 USE_OP(reduce_sum);
 USE_OP(reduce_sum_grad);
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
index b2a96468ece22c979cbe022531bff9e7739e5153..c2f0479460064e05fc917ec432a7384e43e73cf3 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc
@@ -211,7 +211,7 @@ TEST(Benchmark, FluidMLPCPU) {
 }  // namespace imperative
 }  // namespace paddle
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_OP(elementwise_add);
 USE_OP(matmul_v2);
 USE_OP(reduce_sum);
diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
index 7f8b845b0703b70523c8732737d182357a64cf83..250005e31150c3c9d83d3d094ccb4e00b2de7429 100644
--- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
+++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc
@@ -245,7 +245,7 @@ TEST(Benchmark, FluidMLPCUDA) {
 }  // namespace imperative
 }  // namespace paddle
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_OP(matmul_v2);
 USE_OP(reduce_sum);
 USE_OP(reduce_sum_grad);
diff --git a/paddle/fluid/framework/heter_pipeline_trainer_test.cc b/paddle/fluid/framework/heter_pipeline_trainer_test.cc
index 417c7685bcbeb44b2db2b3d849d3915f351cf002..a605d5d6811eb08721a1f220ccb81cafb3babdb6 100644
--- a/paddle/fluid/framework/heter_pipeline_trainer_test.cc
+++ b/paddle/fluid/framework/heter_pipeline_trainer_test.cc
@@ -26,7 +26,7 @@
 #define _LINUX
 #endif
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_NO_KERNEL_OP(heter_listen_and_serv);
 namespace paddle {
 namespace framework {
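A note on the repeated `USE_OP(scale)` to `USE_OP_ITSELF(scale)` change above and below: scale's fluid kernels are deleted later in this patch (the `REGISTER_OP_CPU_KERNEL`/`REGISTER_OP_CUDA_KERNEL` calls in scale_op.cc are removed), so translation units that only need the operator definition must stop referencing the kernel-registration symbol or they fail to link. A minimal, self-contained sketch of the "touch symbol" idiom behind these macros; the symbol names here are hypothetical stand-ins, not Paddle's actual registrar names:

```cpp
#include <iostream>

// Emitted by REGISTER_OPERATOR: the op registrar "touch" function.
int TouchScaleOpRegistrar() { return 0; }

// A kernel registrar touch function would be emitted by
// REGISTER_OP_*_KERNEL; after the pten migration that symbol no longer
// exists for scale, so a USE_OP-style reference would not link.

// USE_OP_ITSELF-style reference: pull in the operator registration only.
static int use_scale_itself = TouchScaleOpRegistrar();

int main() {
  std::cout << "linked against op registrar only: " << use_scale_itself
            << "\n";
}
```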
diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc
index e1d7190a9e418ab5ae049707e2d2d92d78a896be..652286ab2666e6253173f6b7d5c3751a22ee788c 100644
--- a/paddle/fluid/framework/infershape_utils.cc
+++ b/paddle/fluid/framework/infershape_utils.cc
@@ -78,7 +78,6 @@ class InferShapeArgumentMappingContext : public pten::ArgumentMappingContext {
   const InferShapeContext& ctx_;
 };
 
-// TODO(chenweihang): Support SelectedRows later
 // TODO(chenweihang): Support TensorArray later
 class CompatMetaTensor : public pten::MetaTensor {
  public:
@@ -104,7 +103,14 @@ class CompatMetaTensor : public pten::MetaTensor {
   DDim dims() const override {
     if (is_runtime_) {
       auto* var = BOOST_GET_CONST(Variable*, var_);
-      return var->Get<LoDTensor>().dims();
+      if (var->IsType<LoDTensor>()) {
+        return var->Get<LoDTensor>().dims();
+      } else if (var->IsType<pten::SelectedRows>()) {
+        return var->Get<pten::SelectedRows>().dims();
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Currently, dims can only be obtained from DenseTensor or "
+            "SelectedRows."));
+      }
     } else {
       auto* var = BOOST_GET_CONST(VarDesc*, var_);
       return make_ddim(var->GetShape());
@@ -114,7 +120,14 @@ class CompatMetaTensor : public pten::MetaTensor {
   pten::DataType dtype() const override {
     if (is_runtime_) {
       auto* var = BOOST_GET_CONST(Variable*, var_);
-      return var->Get<LoDTensor>().dtype();
+      if (var->IsType<LoDTensor>()) {
+        return var->Get<LoDTensor>().dtype();
+      } else if (var->IsType<pten::SelectedRows>()) {
+        return var->Get<pten::SelectedRows>().dtype();
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Currently, dtype can only be obtained from DenseTensor or "
+            "SelectedRows."));
+      }
     } else {
       auto* var = BOOST_GET_CONST(VarDesc*, var_);
       return pten::TransToPtenDataType(var->GetDataType());
@@ -135,10 +148,16 @@ class CompatMetaTensor : public pten::MetaTensor {
   void set_dims(const DDim& dims) override {
     if (is_runtime_) {
       auto* var = BOOST_GET(Variable*, var_);
-      LoDTensor* tensor = var->GetMutable<LoDTensor>();
-      pten::DenseTensorUtils::GetMutableMeta(
-          static_cast<pten::DenseTensor*>(tensor))
-          ->dims = dims;
+      if (var->IsType<LoDTensor>()) {
+        auto* tensor = var->GetMutable<LoDTensor>();
+        pten::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims;
+      } else if (var->IsType<pten::SelectedRows>()) {
+        auto* tensor = var->GetMutable<pten::SelectedRows>()->mutable_value();
+        pten::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims;
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Currently, dims can only be set for DenseTensor or "
+            "SelectedRows."));
+      }
     } else {
       auto* var = BOOST_GET(VarDesc*, var_);
       var->SetShape(vectorize(dims));
@@ -148,10 +167,16 @@ class CompatMetaTensor : public pten::MetaTensor {
   void set_dtype(pten::DataType dtype) override {
     if (is_runtime_) {
       auto* var = BOOST_GET(Variable*, var_);
-      LoDTensor* tensor = var->GetMutable<LoDTensor>();
-      pten::DenseTensorUtils::GetMutableMeta(
-          static_cast<pten::DenseTensor*>(tensor))
-          ->dtype = dtype;
+      if (var->IsType<LoDTensor>()) {
+        auto* tensor = var->GetMutable<LoDTensor>();
+        pten::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype;
+      } else if (var->IsType<pten::SelectedRows>()) {
+        auto* tensor = var->GetMutable<pten::SelectedRows>()->mutable_value();
+        pten::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype;
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Currently, dtype can only be set for DenseTensor or "
+            "SelectedRows."));
+      }
     } else {
       auto* var = BOOST_GET(VarDesc*, var_);
       var->SetDataType(pten::TransToProtoVarType(dtype));
@@ -174,11 +199,14 @@ class CompatMetaTensor : public pten::MetaTensor {
   void share_lod(const MetaTensor& meta_tensor) override {
     if (is_runtime_) {
       auto* var = BOOST_GET(Variable*, var_);
-      LoDTensor* tensor = var->GetMutable<LoDTensor>();
-      pten::DenseTensorUtils::GetMutableMeta(
-          static_cast<pten::DenseTensor*>(tensor))
-          ->lod =
-          static_cast<const CompatMetaTensor&>(meta_tensor).GetRuntimeLoD();
+      if (var->IsType<LoDTensor>()) {
+        auto* tensor = var->GetMutable<LoDTensor>();
+        pten::DenseTensorUtils::GetMutableMeta(tensor)->lod =
+            static_cast<const CompatMetaTensor&>(meta_tensor).GetRuntimeLoD();
+      } else {
+        // NOTE(chenweihang): do nothing
+        // only LoDTensor needs to share lod
+      }
     } else {
       auto* var = BOOST_GET(VarDesc*, var_);
       var->SetLoDLevel(static_cast<const CompatMetaTensor&>(meta_tensor)
@@ -191,7 +219,21 @@ class CompatMetaTensor : public pten::MetaTensor {
     set_dtype(meta_tensor.dtype());
     // VarDesc doesn't contain layout, so we cannot share layout
     // set_layout(meta_tensor.layout());
+
+    // special case 1: share lod of LoDTensor
     share_lod(meta_tensor);
+
+    // special case 2: share height and rows of SelectedRows in runtime
+    if (is_runtime_) {
+      auto* var = BOOST_GET(Variable*, var_);
+      if (var->IsType<pten::SelectedRows>()) {
+        auto* selected_rows = var->GetMutable<pten::SelectedRows>();
+        auto& input_selected_rows =
+            static_cast<const CompatMetaTensor&>(meta_tensor).GetSelectedRows();
+        selected_rows->set_rows(input_selected_rows.rows());
+        selected_rows->set_height(input_selected_rows.height());
+      }
+    }
   }
 
  private:
@@ -199,11 +241,23 @@
     auto* var = BOOST_GET_CONST(Variable*, var_);
     return var->Get<LoDTensor>().lod();
   }
+
   int32_t GetCompileTimeLoD() const {
     auto* var = BOOST_GET_CONST(VarDesc*, var_);
     return var->GetLoDLevel();
   }
+
+  const pten::SelectedRows& GetSelectedRows() const {
+    PADDLE_ENFORCE_EQ(is_runtime_, true,
+                      platform::errors::Unavailable(
+                          "SelectedRows can only be obtained from MetaTensor "
+                          "at runtime."));
+    auto* var = BOOST_GET_CONST(Variable*, var_);
+    PADDLE_ENFORCE_EQ(var->IsType<pten::SelectedRows>(), true,
+                      platform::errors::Unavailable(
+                          "The Tensor in MetaTensor is not SelectedRows."));
+    return var->Get<pten::SelectedRows>();
+  }
+
   InferShapeVarPtr var_;
   bool is_runtime_;
 };
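All of the CompatMetaTensor changes above follow a single pattern: a SelectedRows variable exposes the metadata of its underlying value DenseTensor, while its row list and height are shared separately at runtime. A minimal standalone sketch of that dispatch idiom, using hypothetical stand-in types rather than Paddle's `Variable`/`BOOST_GET` machinery:

```cpp
#include <cassert>
#include <stdexcept>
#include <variant>
#include <vector>

struct DenseTensor { std::vector<int64_t> dims; };
// SelectedRows wraps a value DenseTensor plus row/height bookkeeping.
struct SelectedRows {
  std::vector<int64_t> rows;
  int64_t height = 0;
  DenseTensor value;
};

using Variable = std::variant<DenseTensor, SelectedRows>;

// Mirrors CompatMetaTensor::dims(): dispatch on the held type, fall through
// to the value tensor for SelectedRows, and reject anything else.
std::vector<int64_t> dims(const Variable& var) {
  if (auto* t = std::get_if<DenseTensor>(&var)) return t->dims;
  if (auto* sr = std::get_if<SelectedRows>(&var)) return sr->value.dims;
  throw std::runtime_error("only DenseTensor or SelectedRows supported");
}

int main() {
  Variable v = SelectedRows{{0, 2}, 4, DenseTensor{{2, 8}}};
  assert(dims(v) == (std::vector<int64_t>{2, 8}));
}
```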
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
index f410171f9989654065fdc78281b94075d6c2c94e..746d90cef917cdb8c4740adf7dff3438c2ca1249 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
@@ -21,7 +21,7 @@
 #include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/framework/program_desc.h"
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_OP(elementwise_mul);
 USE_OP(elementwise_add);
 USE_OP(elementwise_add_grad);
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index ef9c5b921349266a56247dcced4066e71ebd15d6..53cc741d25664b175c12ee13ab2dc0c8330e28bc 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -393,7 +393,8 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
     platform::RecordEvent infershape_event("InferShape");
     // If it is OperatorBase, InferShape does nothing.
     if (op_with_kernel != nullptr)
-      op_with_kernel->InferShape(instr_node.InnerInferShapeContext().get());
+      op_with_kernel->Info().infer_shape_(
+          instr_node.InnerInferShapeContext().get());
   }
 
   if (op_with_kernel != nullptr &&
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 50c315bf03848966d196dd86d2122b346c9e88d4..5ab14a1daba226f02e92db4d0d172bf2ac549646 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1998,16 +1998,17 @@ void OperatorWithKernel::BuildPtenKernelContext(
     size_t end_idx = start_idx + ins_vector.size();
 
     for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
-      const framework::Tensor* tensor_in = nullptr;
+      const pten::TensorBase* tensor_in = nullptr;
       auto* var = ins_vector[offset];
-      if (var->IsType<framework::LoDTensor>()) {
-        tensor_in = &(var->Get<framework::LoDTensor>());
+      if (var->IsType<pten::DenseTensor>()) {
+        tensor_in = &(var->Get<pten::DenseTensor>());
+      } else if (var->IsType<pten::SelectedRows>()) {
+        tensor_in = &(var->Get<pten::SelectedRows>());
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "Unsupported input `%s` type when calling pt kernel.",
             framework::ToTypeName(var->Type())));
-      }  // TODO(zyfncg): Add support for SelectedRows
-
+      }
       pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
     }
     pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
@@ -2021,17 +2022,20 @@ void OperatorWithKernel::BuildPtenKernelContext(
     size_t end_idx = start_idx + outs_vector.size();
 
     for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
-      framework::Tensor* tensor_out = nullptr;
+      pten::TensorBase* tensor_out = nullptr;
       auto* var = outs_vector[offset];
-      if (var->template IsType<framework::LoDTensor>()) {
-        tensor_out = var->template GetMutable<framework::LoDTensor>();
+      if (var->template IsType<pten::DenseTensor>()) {
+        tensor_out = var->template GetMutable<pten::DenseTensor>();
+      } else if (var->template IsType<pten::SelectedRows>()) {
+        tensor_out = var->template GetMutable<pten::SelectedRows>();
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "Unsupported output `%s` type when calling pt kernel.",
             framework::ToTypeName(var->Type())));
-      }  // TODO(zyfncg): Add support for SelectedRows
+      }
 
-      experimental::ResetTensorByArgDef(tensor_out, output_defs.at(i));
+      experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out,
+                                                      output_defs.at(i));
       SetAllocationForOutputTenosr(
           tensor_out, pten::TransToFluidPlace(output_defs.at(i).backend));
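Both kernel-context builders now traffic in `pten::TensorBase*`, and the helpers further down (`SetAllocationForOutputTenosr`, `ResetTensorDtypeAndLayoutByArgDef`) recover the concrete type with LLVM-style RTTI (`classof`) rather than `dynamic_cast`. A minimal sketch of that idiom with hypothetical stand-in types; Paddle's real hierarchy keys off its own type-info mechanism:

```cpp
#include <cstdio>

// LLVM-style RTTI: each concrete tensor kind answers classof() by
// inspecting a cheap tag stored in the base, avoiding dynamic_cast.
struct TensorBase {
  enum class Kind { kDense, kSelectedRows } kind;
  explicit TensorBase(Kind k) : kind(k) {}
  virtual ~TensorBase() = default;
};

struct DenseTensor : TensorBase {
  DenseTensor() : TensorBase(Kind::kDense) {}
  static bool classof(const TensorBase* t) { return t->kind == Kind::kDense; }
};

struct SelectedRows : TensorBase {
  SelectedRows() : TensorBase(Kind::kSelectedRows) {}
  static bool classof(const TensorBase* t) {
    return t->kind == Kind::kSelectedRows;
  }
};

void Describe(const TensorBase* t) {
  if (DenseTensor::classof(t)) {
    std::puts("dense output");
  } else if (SelectedRows::classof(t)) {
    std::puts("selected-rows output");  // handled through its value tensor
  }
}

int main() {
  DenseTensor d;
  SelectedRows s;
  Describe(&d);
  Describe(&s);
}
```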
-                         ? 0
-                         : experimental::SizeOf(tensor->dtype());
-    int64_t numels = product(tensor->dims());
-    numels = numels < 0 ? 0 : numels;
-    auto tmp_allocation_ptr = memory::Alloc(place, numels * dtype_size);
-    auto& deleter = tmp_allocation_ptr.get_deleter();
-    auto* allocation_ptr = tmp_allocation_ptr.release();
-    auto shared_allocation =
-        std::shared_ptr<pten::Allocation>(allocation_ptr, deleter);
-
-    tensor->ResetHolder(shared_allocation);
+  if (pten::DenseTensor::classof(tensor)) {
+    auto* dense_tensor = static_cast<pten::DenseTensor*>(tensor);
+    if (!dense_tensor->IsInitialized() || !(dense_tensor->place() == place)) {
+      SetAllocationForUninitializedDenseTensor(dense_tensor, place);
+    }
+  } else if (pten::SelectedRows::classof(tensor)) {
+    auto* selected_rows = static_cast<pten::SelectedRows*>(tensor);
+    if (!selected_rows->value().IsInitialized() ||
+        !(selected_rows->place() == place)) {
+      SetAllocationForUninitializedDenseTensor(selected_rows->mutable_value(),
+                                               place);
+    }
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported tensor type is received when setting allocation for "
+        "output tensor."));
   }
 }
diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h
index ae0388079d280a3c3ffa2637f6114a58141387ed..44f5ee9f9d8c0c63dcc09947c6e23b786fb4932b 100644
--- a/paddle/fluid/framework/pten_utils.h
+++ b/paddle/fluid/framework/pten_utils.h
@@ -63,7 +63,7 @@ class KernelArgsNameMaker {
 
 void InitDefaultKernelSignatureMap();
 
-void SetAllocationForOutputTenosr(pten::DenseTensor* tensor,
+void SetAllocationForOutputTenosr(pten::TensorBase* tensor,
                                   const platform::Place& place);
 
 // TODO(Wilber): support others device context.
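The helper factored out above sizes the buffer as `numel * SizeOf(dtype)`, treating an `UNDEFINED` dtype as zero bytes and clamping a negative element count (which a `-1` placeholder dim would produce) to zero. A compilable sketch of just that sizing rule, with a plain `malloc` standing in for `memory::Alloc` (an assumption made purely for illustration):

```cpp
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <memory>
#include <numeric>
#include <vector>

// Sketch of the sizing rule in SetAllocationForUninitializedDenseTensor:
// bytes = max(numel, 0) * dtype_size; UNDEFINED dtype contributes 0 bytes.
std::shared_ptr<void> AllocateForDims(const std::vector<int64_t>& dims,
                                      int dtype_size /* 0 if UNDEFINED */) {
  int64_t numel = std::accumulate(dims.begin(), dims.end(), int64_t{1},
                                  std::multiplies<int64_t>());
  if (numel < 0) numel = 0;  // a -1 placeholder dim would poison the product
  return std::shared_ptr<void>(std::malloc(numel * dtype_size), std::free);
}

int main() {
  auto holder = AllocateForDims({2, 3, 4}, /*dtype_size=*/4);  // 96 bytes
  return holder ? 0 : 1;
}
```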
            framework::ToTypeName(var->Type())));
-      }  // TODO(zyfncg): Add support for SelectedRows
+      }
 
-      experimental::ResetTensorByArgDef(tensor_out, output_defs.at(i));
+      experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out,
+                                                      output_defs.at(i));
       framework::SetAllocationForOutputTenosr(
           tensor_out, pten::TransToFluidPlace(output_defs.at(i).backend));
diff --git a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc
index a195b8dee3c2f5580be5f7c094194576b9eccb88..ddc6287011bcff9f12065b005faa315ffeec948a 100644
--- a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc
+++ b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc
@@ -33,7 +33,7 @@ using MultiVarMsg = ::paddle::distributed::MultiVariableMessage;
 using VarMsg = ::paddle::distributed::VariableMessage;
 DECLARE_double(eager_delete_tensor_gb);
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_NO_KERNEL_OP(heter_listen_and_serv);
 
 framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) {
diff --git a/paddle/fluid/operators/pscore/heter_server_test.cc b/paddle/fluid/operators/pscore/heter_server_test.cc
index 7914e9d9a1058ab15a08e3b0dee8725e7a74bb38..f7e8ae1c09d031d761d43481aa2a955f683cf956 100644
--- a/paddle/fluid/operators/pscore/heter_server_test.cc
+++ b/paddle/fluid/operators/pscore/heter_server_test.cc
@@ -29,7 +29,7 @@ namespace distributed = paddle::distributed;
 using MultiVarMsg = ::paddle::distributed::MultiVariableMessage;
 using VarMsg = ::paddle::distributed::VariableMessage;
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 
 std::shared_ptr<distributed::HeterServer> b_rpc_service;
diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc
index 07fe44601ca08831a9e4372d04c097a8e56644f2..077eecb72a96427c2f99c5e66739820c8a519d60 100644
--- a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc
+++ b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc
@@ -31,7 +31,7 @@ namespace distributed = paddle::distributed;
 using MultiVarMsg = ::paddle::distributed::MultiVariableMessage;
 using VarMsg = ::paddle::distributed::VariableMessage;
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_OP(send_and_recv);
 
 std::shared_ptr<distributed::HeterServer> b_rpc_service;
diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc
index 2c701bdae76010cc1b1e1eb341f30753269269bc..b7049019bc4bef6f8e5c392c4e36735421108d1a 100644
--- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc
+++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc
@@ -35,7 +35,7 @@ namespace memory = paddle::memory;
 using MultiVarMsg = ::paddle::distributed::MultiVariableMessage;
 using VarMsg = ::paddle::distributed::VariableMessage;
 
-USE_OP(scale);
+USE_OP_ITSELF(scale);
 USE_OP(send_and_recv);
 
 std::shared_ptr<distributed::HeterServer> b_rpc_service2;
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index 912af2c85b2cfa8fd4372101285241699633503b..ccf3afe29c73e182bfb6f2b8ab5d642888102158 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -12,49 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/scale_op.h"
 #include <string>
+
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/float16.h"
-
-namespace paddle {
-namespace framework {
-class InferShapeContext;
-class OpDesc;
-}  // namespace framework
-namespace imperative {
-class OpBase;
-}  // namespace imperative
-namespace platform {
-class CPUDeviceContext;
-}  // namespace platform
-}  // namespace paddle
+#include "paddle/pten/core/infermeta_utils.h"
+#include "paddle/pten/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
 
 class ScaleOp : public framework::OperatorWithKernel {
  public:
-  ScaleOp(const std::string &type, const framework::VariableNameMap &inputs,
-          const framework::VariableNameMap &outputs,
-          const framework::AttributeMap &attrs)
-      : OperatorWithKernel(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "scale");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "scale");
-
-    if (ctx->IsRuntime() && ctx->HasInput("ScaleTensor")) {
-      auto scale = ctx->Inputs("ScaleTensor");
-      PADDLE_ENFORCE_EQ(scale.size(), 1,
-                        platform::errors::InvalidArgument(
-                            "Input(ScaleTensor) size must be 1, "
-                            "but received size is %d.",
-                            scale.size()));
-    }
-
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
@@ -150,32 +120,10 @@ DECLARE_INPLACE_OP_INFERER(ScaleOpInplaceInferer, {"X", "Out"});
 
 namespace ops = paddle::operators;
 
+DELCARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor,
+                            PT_INFER_META(pten::UnchangedInferMeta));
 REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker,
                   ops::ScaleGradMaker<paddle::framework::OpDesc>,
                   ops::ScaleGradMaker<paddle::imperative::OpBase>,
-                  ops::ScaleOpVarTypeInference, ops::ScaleOpInplaceInferer);
-REGISTER_OP_CPU_KERNEL(
-    scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext,
-                     paddle::platform::bfloat16>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int16_t>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int64_t>);
-
-REGISTER_OP_CUDA_KERNEL(
-    scale,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, float>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   double>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   paddle::platform::float16>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   uint8_t>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   int8_t>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   int16_t>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   int64_t>);
+                  ScaleInferShapeFunctor, ops::ScaleOpVarTypeInference,
+                  ops::ScaleOpInplaceInferer);
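With the hand-written `InferShape` gone, shape inference for scale is bound through an infer-shape functor (`DELCARE_INFER_SHAPE_FUNCTOR` is the macro's spelling at this point in the codebase) that forwards to `pten::UnchangedInferMeta`, so the logic lives once in pten for both fluid and pten callers. A minimal sketch of what an "unchanged" infer-meta does, using simplified stand-in types rather than pten's real signatures:

```cpp
#include <vector>

// Simplified stand-in for pten::MetaTensor: just dims plus a lod level.
struct MetaTensor {
  std::vector<int64_t> dims;
  int lod_level = 0;
};

// Sketch of pten::UnchangedInferMeta: the output mirrors the input's
// metadata, which is exactly what scale's old InferShape did with
// SetOutputDim("Out", GetInputDim("X")) and ShareLoD("X", "Out").
void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) {
  out->dims = x.dims;
  out->lod_level = x.lod_level;
}

int main() {
  MetaTensor x{{16, 128}, 1}, out;
  UnchangedInferMeta(x, &out);
  return out.dims == x.dims ? 0 : 1;
}
```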
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
deleted file mode 100644
index 8ce0b7984cc0512b630d03d4ec2205d096c0c826..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/scale_op.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/pten_utils.h"
-
-// only can include the headers in paddle/top/api dirs
-#include "paddle/pten/api/lib/utils/tensor_utils.h"
-#include "paddle/pten/kernels/scale_kernel.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-static inline T GetAttrFromTensor(const framework::Tensor* tensor) {
-  const auto* tensor_data = tensor->data<T>();
-  framework::Tensor cpu_tensor;
-  if (platform::is_gpu_place(tensor->place()) ||
-      platform::is_npu_place(tensor->place())) {
-    paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(),
-                                      &cpu_tensor);
-    tensor_data = cpu_tensor.data<T>();
-  }
-  return tensor_data[0];
-}
-
-// See Note [ Why still keep the original kernel implementation? ]
-template <typename DeviceContext, typename T>
-class ScaleKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& ctx) const {
-    auto* in_var = ctx.InputVar("X");
-
-    auto bias = ctx.Attr<float>("bias");
-    auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
-    auto scale = ctx.Attr<float>("scale");
-    auto* out_var = ctx.OutputVar("Out");
-
-    if (ctx.HasInput("ScaleTensor")) {
-      auto* scale_tensor = ctx.Input<framework::Tensor>("ScaleTensor");
-      scale = static_cast<float>(GetAttrFromTensor<T>(scale_tensor));
-    }
-
-    auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
-    auto* out =
-        framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
-    out->mutable_data<T>(in->place());
-    auto& dev_ctx = ctx.device_context<DeviceContext>();
-
-    // call new kernel
-    if (in_var->IsType<pten::SelectedRows>()) {
-      pten::ScaleSR<T>(
-          static_cast<const typename paddle::framework::ConvertToPtenContext<
-              DeviceContext>::TYPE&>(dev_ctx),
-          in_var->Get<pten::SelectedRows>(), scale, bias, bias_after_scale,
-          out_var->GetMutable<pten::SelectedRows>());
-    } else {
-      pten::ScaleKernel<T>(
-          static_cast<const typename paddle::framework::ConvertToPtenContext<
-              DeviceContext>::TYPE&>(dev_ctx),
-          *in, scale, bias, bias_after_scale, out);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/scale_op_mlu.cc b/paddle/fluid/operators/scale_op_mlu.cc
index 1e1187845ce477f939e8cf21650076c875861f3d..d027ac0d3317f0495462e3ec167b94ab89608382 100644
--- a/paddle/fluid/operators/scale_op_mlu.cc
+++ b/paddle/fluid/operators/scale_op_mlu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/scale_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc
index 7d84d56c2b3b870977215a90799062957a36d535..807ad7509e57389bfd47a25ae48c3d72f6a47d28 100644
--- a/paddle/fluid/operators/scale_op_npu.cc
+++ b/paddle/fluid/operators/scale_op_npu.cc
@@ -12,12 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/scale_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
 namespace operators {
 
+template <typename T>
+static inline T GetAttrFromTensor(const framework::Tensor* tensor) {
+  const auto* tensor_data = tensor->data<T>();
+  framework::Tensor cpu_tensor;
+  if (platform::is_gpu_place(tensor->place()) ||
+      platform::is_npu_place(tensor->place())) {
+    paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(),
+                                      &cpu_tensor);
+    tensor_data = cpu_tensor.data<T>();
+  }
+  return tensor_data[0];
+}
+
 template <typename T>
 class ScaleNPUKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc
index 026a5dda89b5f07423090cb83bfb73e706cba7b7..2430007de0a5c25d17247ecc176366d20c3bad80 100644
--- a/paddle/fluid/operators/scale_op_xpu.cc
+++ b/paddle/fluid/operators/scale_op_xpu.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/scale_op.h"
 #include <string>
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/pten/kernels/scale_kernel.h"
 
 namespace paddle {
diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc
index 8fdfc29540bfada65c1b32137eca018a293459b1..e4c20aa971b952ad3cdd0bb54c7ea446fc9998f2 100644
--- a/paddle/pten/api/lib/utils/tensor_utils.cc
+++ b/paddle/pten/api/lib/utils/tensor_utils.cc
@@ -198,12 +198,25 @@ pten::ScalarArray MakePtenScalarArrayFromVarList(
   return {vector_data};
 }
 
-void ResetTensorByArgDef(pten::DenseTensor* dst,
-                         const pten::TensorArgDef& arg_def) {
+void ResetTensorDtypeAndLayoutByArgDef(pten::TensorBase* dst,
+                                       const pten::TensorArgDef& arg_def) {
   VLOG(5) << "ResetTensor by TensorArgDef.";
-  auto* meta = pten::DenseTensorUtils::GetMutableMeta(dst);
-  meta->dtype = arg_def.dtype;
-  meta->layout = arg_def.layout;
+  if (pten::DenseTensor::classof(dst)) {
+    auto* dense_t = static_cast<pten::DenseTensor*>(dst);
+    auto* meta = pten::DenseTensorUtils::GetMutableMeta(dense_t);
+    meta->dtype = arg_def.dtype;
+    meta->layout = arg_def.layout;
+  } else if (pten::SelectedRows::classof(dst)) {
+    auto* selected_rows = static_cast<pten::SelectedRows*>(dst);
+    auto* meta =
+        pten::DenseTensorUtils::GetMutableMeta(selected_rows->mutable_value());
+    meta->dtype = arg_def.dtype;
+    meta->layout = arg_def.layout;
+  } else {
+    PADDLE_THROW(pten::errors::Unimplemented(
+        "Unsupported tensor type is received when resetting tensor dtype and "
+        "layout by argument definition."));
+  }
 }
 
 }  // namespace experimental
diff --git a/paddle/pten/api/lib/utils/tensor_utils.h b/paddle/pten/api/lib/utils/tensor_utils.h
index 1ffcc7d4d5b70a11f288efbd8a8c46a716fb42dc..1e2d8b74db84941f970c0613fad4fa488f813053 100644
--- a/paddle/pten/api/lib/utils/tensor_utils.h
+++ b/paddle/pten/api/lib/utils/tensor_utils.h
@@ -45,8 +45,8 @@ pten::ScalarArray MakePtenScalarArrayFromVar(
 pten::ScalarArray MakePtenScalarArrayFromVarList(
     const std::vector<framework::Variable*>& variable_list);
 
-void ResetTensorByArgDef(pten::DenseTensor* dst,
-                         const pten::TensorArgDef& arg_def);
+void ResetTensorDtypeAndLayoutByArgDef(pten::TensorBase* dst,
+                                       const pten::TensorArgDef& arg_def);
 
 }  // namespace experimental
 }  // namespace paddle
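Two related cleanups end here: `GetAttrFromTensor` moves into scale_op_npu.cc because the deleted scale_op.h was its only home and the NPU kernel its last user, and `ResetTensorByArgDef` is renamed to say what it actually touches (dtype and layout only). The pattern `GetAttrFromTensor` implements, reading a one-element attribute tensor that may live in device memory by staging it through the host, is sketched below with hypothetical stand-in types; in Paddle the staging step is `TensorCopySync`:

```cpp
#include <vector>

// Stand-in for a tensor that may live on device memory (hypothetical type).
struct Tensor {
  bool on_device = false;
  std::vector<float> data;  // host mirror used by this sketch
};

// Pretend device-to-host copy; TensorCopySync plays this role in Paddle.
Tensor CopyToHost(const Tensor& t) { return Tensor{false, t.data}; }

// Mirrors GetAttrFromTensor: a scalar attr such as ScaleTensor holds exactly
// one element, but it must be read from host-accessible memory.
float GetAttrFromTensor(const Tensor& t) {
  if (t.on_device) {
    Tensor cpu = CopyToHost(t);
    return cpu.data[0];
  }
  return t.data[0];
}

int main() {
  Tensor scale{true, {2.5f}};
  return GetAttrFromTensor(scale) == 2.5f ? 0 : 1;
}
```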
KernelContext::EmplaceBackOutputWithoutSetRange(
   outputs_.emplace_back(output);
 }
 
-void KernelContext::SetOutputWithoutSetRange(int index, TensorBase* output) {
-  outputs_.at(index) = output;
-}
-
 void KernelContext::EmplaceBackOutputs(
     paddle::SmallVector<TensorBase*> outputs) {
   int index = outputs_.size();
@@ -103,15 +99,4 @@ const std::pair<int, int>& KernelContext::OutputRangeAt(size_t idx) const {
   return output_range_.at(idx);
 }
 
-std::pair<int, int>& KernelContext::MutableInputRangeAt(size_t idx) {
-  return input_range_[idx];
-}
-
-std::pair<int, int>& KernelContext::MutableOutputRangeAt(size_t idx) {
-  return output_range_[idx];
-}
-
-// Temporary method: For compatible with fluid Tensor and improve performance
-// Only deal with DenseTensor now
-void KernelContext::ClearData() { attrs_.clear(); }
 }  // namespace pten
diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h
index 876c98e3bcf6b4dea82dbb9d86b1aa28348a5d1c..25a1f2ed9bb16b82c48184d565954a812639598a 100644
--- a/paddle/pten/core/kernel_context.h
+++ b/paddle/pten/core/kernel_context.h
@@ -26,10 +26,8 @@
 
 namespace pten {
 
-using DeviceContext = pten::DeviceContext;
-
 /**
- * Note: KernelContext doesn't manage the life if DeviceContext and Tensor
+ * Note: KernelContext doesn't manage the life of DeviceContext and Tensor
  *
  * Note: KernelContext does not couple the concept of framework,
  * its constructor can only take the members it needs as parameters,
@@ -59,17 +57,15 @@ class KernelContext {
 
   void EmplaceBackOutputs(paddle::SmallVector<TensorBase*> outputs);
 
-  void SetOutputWithoutSetRange(int index, TensorBase* output);
-
   void EmplaceBackAttr(paddle::any attr);
 
   const std::pair<int, int>& InputRangeAt(size_t idx) const;
 
   const std::pair<int, int>& OutputRangeAt(size_t idx) const;
 
-  std::pair<int, int>& MutableInputRangeAt(size_t idx);
+  void AssignInputRange(std::pair<int, int>&& range, size_t idx);
 
-  std::pair<int, int>& MutableOutputRangeAt(size_t idx);
+  void AssignOutputRange(std::pair<int, int>&& range, size_t idx);
 
   template <typename TensorType>
   const TensorType& InputAt(size_t idx) const {
@@ -90,15 +86,11 @@ class KernelContext {
     for (size_t i = start; i < end; ++i) {
       auto t = static_cast<const TensorType*>(inputs_.at(i));
       v.emplace_back(*t);
-      inputs_.at(i) = nullptr;
+      inputs_[i] = nullptr;
     }
     return v;
   }
 
-  void AssignInputRange(std::pair<int, int>&& range, size_t idx);
-
-  void AssignOutputRange(std::pair<int, int>&& range, size_t idx);
-
   template <typename TensorType>
   TensorType* MutableOutputAt(size_t idx) {
     return static_cast<TensorType*>(outputs_.at(idx));
@@ -110,7 +102,6 @@ class KernelContext {
     for (size_t i = start; i < end; ++i) {
       v.emplace_back(static_cast<TensorType*>(outputs_.at(i)));
     }
-
     return v;
   }
 
@@ -124,25 +115,17 @@ class KernelContext {
     }
   }
 
-  // Temporary method: For compatible with fluid Tensor and improve performance
-  // Only deal with DenseTensor now
-  void ClearData();
-
   size_t InputsSize() const { return inputs_.size(); }
   size_t OutputsSize() const { return outputs_.size(); }
   size_t AttrsSize() const { return attrs_.size(); }
 
  private:
-  // DeviceContext base class
   DeviceContext* dev_ctx_;
 
-  // TODO(chenweihang): Tensor -> Tensor*, Tensor should by managed `scope`
-  // Note: can't use API Tensor here, the inference don't use this API Tensor
   paddle::SmallVector<const TensorBase*> inputs_;
   paddle::SmallVector<TensorBase*> outputs_;
   paddle::SmallVector<paddle::any> attrs_;
 
-  // Only contains input like list[Tensor] need `range`
   paddle::SmallVector<std::pair<int, int>> input_range_;
   paddle::SmallVector<std::pair<int, int>> output_range_;
 };
diff --git a/paddle/pten/ops/compat/scale_sig.cc b/paddle/pten/ops/compat/scale_sig.cc
index 5ce159a5d84c9faba760cd7b8605f2bd0734c53f..279be3df54a36b0707fe43478dc94721a5f18c1f 100644
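The KernelContext cleanup above removes dead mutators in favor of `AssignInputRange`/`AssignOutputRange`: tensors for one op argument are stored flat, and each argument owns a contiguous `[start, end)` slice of the flat list. A compressed sketch of how the builders in operator.cc and prepared_operator.h drive this API; the class below is a simplified stand-in, not pten's real `KernelContext`:

```cpp
#include <cassert>
#include <utility>
#include <vector>

// Simplified stand-in: inputs live in one flat vector, and input_range[i]
// records which slice of that vector belongs to op argument i.
struct KernelContext {
  std::vector<const void*> inputs;
  std::vector<std::pair<int, int>> input_range;

  void EmplaceBackInputWithoutSetRange(const void* t) { inputs.push_back(t); }
  void AssignInputRange(std::pair<int, int>&& range, size_t i) {
    if (input_range.size() <= i) input_range.resize(i + 1);
    input_range[i] = range;
  }
};

int main() {
  // One duplicable argument made of two tensors: it owns the range [0, 2).
  int a = 0, b = 1;
  KernelContext ctx;
  size_t start = ctx.inputs.size();
  ctx.EmplaceBackInputWithoutSetRange(&a);
  ctx.EmplaceBackInputWithoutSetRange(&b);
  ctx.AssignInputRange(std::make_pair(int(start), int(ctx.inputs.size())), 0);
  assert(ctx.input_range[0] == std::make_pair(0, 2));
}
```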
--- a/paddle/pten/ops/compat/scale_sig.cc
+++ b/paddle/pten/ops/compat/scale_sig.cc
@@ -16,9 +16,37 @@ limitations under the License. */
 
 namespace pten {
 
+/**
+ * Note [ Why does the ArgumentMapping function need to be so complicated? ]
+ *
+ * In order to meet the requirements of infrt, the function used to match Op
+ * and Kernel parameters needs to be placed in pten as a compatible
+ * component, and must not depend on fluid.
+ *
+ * Because infrt not only needs to dynamically call this argument mapping
+ * function at runtime, but also needs to statically declare all possible
+ * results of the function before running, without any runtime information.
+ *
+ * The infrt declaration looks like:
+ *
+ *   def PDKEL_Reshape_to_CPU : Pat<
+ *       (PD_ReshapeOp $x, $shape_tensor, $shape_attr),  // OpMaker arguments
+ *       (PDKEL_ReshapeKernelAttr $x, fn($shape_attr)>;  // Kernel arguments
+ *   def PDKEL_Reshape_to_CPU : Pat<
+ *       (PD_ReshapeOp $x, $shape_tensor, $shape_attr),
+ *       (PDKEL_ReshapeKernelAttr $x, fn($shape_tensor)>;
+ *
+ * Therefore, we need to write out each result of the argument mapping
+ * function explicitly, like `KernelSignature("full", {}, {"ShapeTensor",
+ * "value"}, {"Out"})`; a result cannot contain variables, only const char*
+ * strings.
+ *
+ * Infrt will parse all results before running to generate the static
+ * declarations above, which forces some of these functions to be written in
+ * a long-winded way; the complicated ones may run to hundreds of lines,
+ * which has certain side effects on the programming experience.
+ */
 KernelSignature ScaleOpArgumentMapping(const ArgumentMappingContext& ctx) {
   if (ctx.IsDenseTensorInput("X")) {
-    std::string scale_attr;
     if (ctx.HasInput("ScaleTensor")) {
       return KernelSignature(
           "scale", {"X"}, {"ScaleTensor", "bias", "bias_after_scale"}, {"Out"});
@@ -26,9 +54,19 @@ KernelSignature ScaleOpArgumentMapping(const ArgumentMappingContext& ctx) {
       return KernelSignature(
           "scale", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"});
     }
+  } else if (ctx.IsSelectedRowsInput("X")) {
+    if (ctx.HasInput("ScaleTensor")) {
+      return KernelSignature("scale_sr",
+                             {"X"},
+                             {"ScaleTensor", "bias", "bias_after_scale"},
+                             {"Out"});
+    } else {
+      return KernelSignature(
+          "scale_sr", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"});
+    }
+  } else {
+    return KernelSignature("unregistered", {}, {}, {});
   }
-  // TODO(chenweihang): support other cases after selected rows added
-  return KernelSignature("scale.unregistered", {}, {}, {});
 }
 
 }  // namespace pten
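To make the payoff of the new branch concrete: a SelectedRows input now maps to the `scale_sr` pten kernel instead of falling through to `unregistered`, and the attribute slot switches between the `ScaleTensor` input and the `scale` attribute. A minimal sketch of that mapping logic in isolation, with a stand-in context type rather than pten's `ArgumentMappingContext`:

```cpp
#include <cassert>
#include <string>

// Stand-in for the three predicates ScaleOpArgumentMapping consults.
struct Ctx {
  bool dense_input;
  bool selected_rows_input;
  bool has_scale_tensor;
};

// Mirrors the branch structure added above: DenseTensor -> "scale",
// SelectedRows -> "scale_sr", anything else -> "unregistered".
std::string MapScaleKernel(const Ctx& ctx) {
  if (ctx.dense_input) return "scale";
  if (ctx.selected_rows_input) return "scale_sr";
  return "unregistered";
}

// The attribute slot mirrors the second KernelSignature argument list.
std::string AttrSlot(const Ctx& ctx) {
  return ctx.has_scale_tensor ? "ScaleTensor" : "scale";
}

int main() {
  assert(MapScaleKernel({false, true, false}) == "scale_sr");
  assert(MapScaleKernel({true, false, true}) == "scale");
  assert(MapScaleKernel({false, false, false}) == "unregistered");
  assert(AttrSlot({true, false, true}) == "ScaleTensor");
}
```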