Unverified commit 41eb2595, authored by Chen Weihang, committed by GitHub

[PTen] Support SelectedRows in execution and remove scale OpKernel and InferShape (#39351)

* Adapt SelectedRows in execution

* Implement the SelectedRows branch

* Support SelectedRows in InferShape utils

* Fix device compile failure

* Fix new executor test failure

* Revert some changes
Parent commit: 42910361
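Every execution path touched by this change applies the same dispatch: check whether a Variable holds a pten::DenseTensor or a pten::SelectedRows, and hand the matching pten::TensorBase pointer to the pten kernel context. Below is a minimal sketch of that pattern, assuming the pten headers added in this diff; GetInputTensorBase is a hypothetical helper named for illustration, not part of the commit.

// Simplified sketch of the DenseTensor/SelectedRows dispatch used throughout
// this patch; illustrative only, not a verbatim excerpt.
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/selected_rows.h"

const pten::TensorBase* GetInputTensorBase(
    const paddle::framework::Variable& var) {
  if (var.IsType<pten::DenseTensor>()) {
    return &var.Get<pten::DenseTensor>();   // dense input
  }
  if (var.IsType<pten::SelectedRows>()) {
    return &var.Get<pten::SelectedRows>();  // sparse (SelectedRows) input
  }
  PADDLE_THROW(paddle::platform::errors::Unimplemented(
      "Unsupported input type when calling a pt kernel."));
  return nullptr;  // unreachable; silences missing-return warnings
}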
......@@ -175,7 +175,7 @@ TEST(Benchmark, EagerIntermediateMLPCPU) {
}
}
USE_OP(scale);
USE_OP_ITSELF(scale);
USE_OP(elementwise_add);
USE_OP(matmul_v2);
USE_OP(reduce_sum);
......@@ -185,7 +185,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) {
}
}
USE_OP(scale);
USE_OP_ITSELF(scale);
USE_OP(matmul_v2);
USE_OP(reduce_sum);
USE_OP(reduce_sum_grad);
......
......@@ -211,7 +211,7 @@ TEST(Benchmark, FluidMLPCPU) {
} // namespace imperative
} // namespace paddle
USE_OP(scale);
USE_OP_ITSELF(scale);
USE_OP(elementwise_add);
USE_OP(matmul_v2);
USE_OP(reduce_sum);
......@@ -245,7 +245,7 @@ TEST(Benchmark, FluidMLPCUDA) {
} // namespace imperative
} // namespace paddle
USE_OP(scale);
USE_OP_ITSELF(scale);
USE_OP(matmul_v2);
USE_OP(reduce_sum);
USE_OP(reduce_sum_grad);
......
......@@ -26,7 +26,7 @@
#define _LINUX
#endif
USE_OP(scale);
USE_OP_ITSELF(scale);
USE_NO_KERNEL_OP(heter_listen_and_serv);
namespace paddle {
namespace framework {
......
......@@ -78,7 +78,6 @@ class InferShapeArgumentMappingContext : public pten::ArgumentMappingContext {
const InferShapeContext& ctx_;
};
// TODO(chenweihang): Support SelectedRows later
// TODO(chenweihang): Support TensorArray later
class CompatMetaTensor : public pten::MetaTensor {
public:
......@@ -104,7 +103,14 @@ class CompatMetaTensor : public pten::MetaTensor {
DDim dims() const override {
if (is_runtime_) {
auto* var = BOOST_GET_CONST(Variable*, var_);
return var->Get<LoDTensor>().dims();
if (var->IsType<pten::DenseTensor>()) {
return var->Get<pten::DenseTensor>().dims();
} else if (var->IsType<pten::SelectedRows>()) {
return var->Get<pten::SelectedRows>().dims();
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Currently, only can get dims from DenseTensor or SelectedRows."));
}
} else {
auto* var = BOOST_GET_CONST(VarDesc*, var_);
return make_ddim(var->GetShape());
......@@ -114,7 +120,14 @@ class CompatMetaTensor : public pten::MetaTensor {
pten::DataType dtype() const override {
if (is_runtime_) {
auto* var = BOOST_GET_CONST(Variable*, var_);
return var->Get<LoDTensor>().dtype();
if (var->IsType<pten::DenseTensor>()) {
return var->Get<pten::DenseTensor>().dtype();
} else if (var->IsType<pten::SelectedRows>()) {
return var->Get<pten::SelectedRows>().dtype();
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Currently, only can get dtype from DenseTensor or SelectedRows."));
}
} else {
auto* var = BOOST_GET_CONST(VarDesc*, var_);
return pten::TransToPtenDataType(var->GetDataType());
......@@ -135,10 +148,16 @@ class CompatMetaTensor : public pten::MetaTensor {
void set_dims(const DDim& dims) override {
if (is_runtime_) {
auto* var = BOOST_GET(Variable*, var_);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
pten::DenseTensorUtils::GetMutableMeta(
static_cast<pten::DenseTensor*>(tensor))
->dims = dims;
if (var->IsType<pten::DenseTensor>()) {
auto* tensor = var->GetMutable<pten::DenseTensor>();
pten::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims;
} else if (var->IsType<pten::SelectedRows>()) {
auto* tensor = var->GetMutable<pten::SelectedRows>()->mutable_value();
pten::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Currently, only can set dims from DenseTensor or SelectedRows."));
}
} else {
auto* var = BOOST_GET(VarDesc*, var_);
var->SetShape(vectorize(dims));
......@@ -148,10 +167,16 @@ class CompatMetaTensor : public pten::MetaTensor {
void set_dtype(pten::DataType dtype) override {
if (is_runtime_) {
auto* var = BOOST_GET(Variable*, var_);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
pten::DenseTensorUtils::GetMutableMeta(
static_cast<pten::DenseTensor*>(tensor))
->dtype = dtype;
if (var->IsType<pten::DenseTensor>()) {
auto* tensor = var->GetMutable<pten::DenseTensor>();
pten::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype;
} else if (var->IsType<pten::SelectedRows>()) {
auto* tensor = var->GetMutable<pten::SelectedRows>()->mutable_value();
pten::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype;
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Currently, only can set dtype from DenseTensor or SelectedRows."));
}
} else {
auto* var = BOOST_GET(VarDesc*, var_);
var->SetDataType(pten::TransToProtoVarType(dtype));
......@@ -174,11 +199,14 @@ class CompatMetaTensor : public pten::MetaTensor {
void share_lod(const MetaTensor& meta_tensor) override {
if (is_runtime_) {
auto* var = BOOST_GET(Variable*, var_);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
pten::DenseTensorUtils::GetMutableMeta(
static_cast<pten::DenseTensor*>(tensor))
->lod =
static_cast<const CompatMetaTensor&>(meta_tensor).GetRuntimeLoD();
if (var->IsType<pten::DenseTensor>()) {
auto* tensor = var->GetMutable<pten::DenseTensor>();
pten::DenseTensorUtils::GetMutableMeta(tensor)->lod =
static_cast<const CompatMetaTensor&>(meta_tensor).GetRuntimeLoD();
} else {
// NOTE(chenweihang): do nothing
// only LoDTensor needs to share lod
}
} else {
auto* var = BOOST_GET(VarDesc*, var_);
var->SetLoDLevel(static_cast<const CompatMetaTensor&>(meta_tensor)
......@@ -191,7 +219,21 @@ class CompatMetaTensor : public pten::MetaTensor {
set_dtype(meta_tensor.dtype());
// VarDesc doesn't contains layout, so we cannot share layout
// set_layout(meta_tensor.layout());
// special case 1: share lod of LoDTensor
share_lod(meta_tensor);
// special case 2: share height and rows of SelectedRows in runtime
if (is_runtime_) {
auto* var = BOOST_GET(Variable*, var_);
if (var->IsType<pten::SelectedRows>()) {
auto* selected_rows = var->GetMutable<pten::SelectedRows>();
auto& input_selected_rows =
static_cast<const CompatMetaTensor&>(meta_tensor).GetSelectedRows();
selected_rows->set_rows(input_selected_rows.rows());
selected_rows->set_height(input_selected_rows.height());
}
}
}
private:
......@@ -199,11 +241,23 @@ class CompatMetaTensor : public pten::MetaTensor {
auto* var = BOOST_GET_CONST(Variable*, var_);
return var->Get<LoDTensor>().lod();
}
int32_t GetCompileTimeLoD() const {
auto* var = BOOST_GET_CONST(VarDesc*, var_);
return var->GetLoDLevel();
}
const pten::SelectedRows& GetSelectedRows() const {
PADDLE_ENFORCE_EQ(is_runtime_, true,
platform::errors::Unavailable(
"Only can get Tensor from MetaTensor in rumtime."));
auto* var = BOOST_GET_CONST(Variable*, var_);
PADDLE_ENFORCE_EQ(var->IsType<pten::SelectedRows>(), true,
platform::errors::Unavailable(
"The Tensor in MetaTensor is not SelectedRows."));
return var->Get<pten::SelectedRows>();
}
InferShapeVarPtr var_;
bool is_runtime_;
};
......
......@@ -21,7 +21,7 @@
#include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/framework/program_desc.h"
USE_OP(scale);
USE_OP_ITSELF(scale);
USE_OP(elementwise_mul);
USE_OP(elementwise_add);
USE_OP(elementwise_add_grad);
......
......@@ -393,7 +393,8 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
platform::RecordEvent infershape_event("InferShape");
// If it is OperatorBase, InferShape does nothing.
if (op_with_kernel != nullptr)
op_with_kernel->InferShape(instr_node.InnerInferShapeContext().get());
op_with_kernel->Info().infer_shape_(
instr_node.InnerInferShapeContext().get());
}
if (op_with_kernel != nullptr &&
......
......@@ -1998,16 +1998,17 @@ void OperatorWithKernel::BuildPtenKernelContext(
size_t end_idx = start_idx + ins_vector.size();
for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
const framework::Tensor* tensor_in = nullptr;
const pten::TensorBase* tensor_in = nullptr;
auto* var = ins_vector[offset];
if (var->IsType<framework::LoDTensor>()) {
tensor_in = &(var->Get<framework::LoDTensor>());
if (var->IsType<pten::DenseTensor>()) {
tensor_in = &(var->Get<pten::DenseTensor>());
} else if (var->IsType<pten::SelectedRows>()) {
tensor_in = &(var->Get<pten::SelectedRows>());
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported input `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
} // TODO(zyfncg): Add support for SelectedRows
}
pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
}
pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i);
......@@ -2021,17 +2022,20 @@ void OperatorWithKernel::BuildPtenKernelContext(
size_t end_idx = start_idx + outs_vector.size();
for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
framework::Tensor* tensor_out = nullptr;
pten::TensorBase* tensor_out = nullptr;
auto* var = outs_vector[offset];
if (var->template IsType<framework::LoDTensor>()) {
tensor_out = var->template GetMutable<framework::LoDTensor>();
if (var->template IsType<pten::DenseTensor>()) {
tensor_out = var->template GetMutable<pten::DenseTensor>();
} else if (var->template IsType<pten::SelectedRows>()) {
tensor_out = var->template GetMutable<pten::SelectedRows>();
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported output `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
} // TODO(zyfncg): Add support for SelectedRows
}
experimental::ResetTensorByArgDef(tensor_out, output_defs.at(i));
experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out,
output_defs.at(i));
SetAllocationForOutputTenosr(
tensor_out, pten::TransToFluidPlace(output_defs.at(i).backend));
......
......@@ -207,21 +207,40 @@ void InitDefaultKernelSignatureMap() {
});
}
void SetAllocationForOutputTenosr(pten::DenseTensor* tensor,
static void SetAllocationForUninitializedDenseTensor(
pten::DenseTensor* dense_tensor, const platform::Place& place) {
int dtype_size = dense_tensor->dtype() == DataType::UNDEFINED
? 0
: experimental::SizeOf(dense_tensor->dtype());
int64_t numels = product(dense_tensor->dims());
numels = numels < 0 ? 0 : numels;
auto tmp_allocation_ptr = memory::Alloc(place, numels * dtype_size);
auto& deleter = tmp_allocation_ptr.get_deleter();
auto* allocation_ptr = tmp_allocation_ptr.release();
auto shared_allocation =
std::shared_ptr<pten::Allocation>(allocation_ptr, deleter);
dense_tensor->ResetHolder(shared_allocation);
}
void SetAllocationForOutputTenosr(pten::TensorBase* tensor,
const platform::Place& place) {
if (!tensor->IsInitialized() || !(tensor->place() == place)) {
int dtype_size = tensor->dtype() == DataType::UNDEFINED
? 0
: experimental::SizeOf(tensor->dtype());
int64_t numels = product(tensor->dims());
numels = numels < 0 ? 0 : numels;
auto tmp_allocation_ptr = memory::Alloc(place, numels * dtype_size);
auto& deleter = tmp_allocation_ptr.get_deleter();
auto* allocation_ptr = tmp_allocation_ptr.release();
auto shared_allocation =
std::shared_ptr<pten::Allocation>(allocation_ptr, deleter);
tensor->ResetHolder(shared_allocation);
if (pten::DenseTensor::classof(tensor)) {
auto* dense_tensor = static_cast<pten::DenseTensor*>(tensor);
if (!dense_tensor->IsInitialized() || !(dense_tensor->place() == place)) {
SetAllocationForUninitializedDenseTensor(dense_tensor, place);
}
} else if (pten::SelectedRows::classof(tensor)) {
auto* selected_rows = static_cast<pten::SelectedRows*>(tensor);
if (!selected_rows->value().IsInitialized() ||
!(selected_rows->place() == place)) {
SetAllocationForUninitializedDenseTensor(selected_rows->mutable_value(),
place);
}
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported tensor type is received when setting allocation for "
"output tensor."));
}
}
......
......@@ -63,7 +63,7 @@ class KernelArgsNameMaker {
void InitDefaultKernelSignatureMap();
void SetAllocationForOutputTenosr(pten::DenseTensor* tensor,
void SetAllocationForOutputTenosr(pten::TensorBase* tensor,
const platform::Place& place);
// TODO(Wilber): support others device context.
......
......@@ -29,6 +29,9 @@
#include "paddle/fluid/imperative/type_defs.h"
#include "paddle/fluid/imperative/var_helper.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/selected_rows.h"
DECLARE_bool(use_mkldnn);
namespace paddle {
......@@ -262,7 +265,17 @@ void BuildDygraphPtenKernelContext(
size_t end_idx = start_idx + ins_vector.size();
for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
const auto* tensor_in = GetTensorFromVar(ins_vector[offset]->Var());
const pten::TensorBase* tensor_in = nullptr;
auto& var = ins_vector[offset]->Var();
if (var.template IsType<pten::DenseTensor>()) {
tensor_in = &(var.template Get<pten::DenseTensor>());
} else if (var.template IsType<pten::SelectedRows>()) {
tensor_in = &(var.template Get<pten::SelectedRows>());
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported input `%s` type when call pt kernel.",
framework::ToTypeName(var.Type())));
}
kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in);
}
kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
......@@ -287,17 +300,21 @@ void BuildDygraphPtenKernelContext(
kernel_ctx->EmplaceBackOutputWithoutSetRange({nullptr});
continue;
}
pten::TensorBase* tensor_out = nullptr;
auto* var = outs_vector[offset]->MutableVar();
framework::Tensor* tensor_out = nullptr;
if (var->template IsType<framework::LoDTensor>()) {
tensor_out = var->template GetMutable<framework::LoDTensor>();
if (var->template IsType<pten::DenseTensor>()) {
tensor_out = var->template GetMutable<pten::DenseTensor>();
} else if (var->template IsType<pten::SelectedRows>()) {
tensor_out = var->template GetMutable<pten::SelectedRows>();
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported output `%s` type when call pt kernel.",
framework::ToTypeName(var->Type())));
} // TODO(zyfncg): Add support for SelectedRows
}
experimental::ResetTensorByArgDef(tensor_out, output_defs.at(i));
experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out,
output_defs.at(i));
framework::SetAllocationForOutputTenosr(
tensor_out, pten::TransToFluidPlace(output_defs.at(i).backend));
......
......@@ -33,7 +33,7 @@ using MultiVarMsg = ::paddle::distributed::MultiVariableMessage;
using VarMsg = ::paddle::distributed::VariableMessage;
DECLARE_double(eager_delete_tensor_gb);
USE_OP(scale);
USE_OP_ITSELF(scale);
USE_NO_KERNEL_OP(heter_listen_and_serv);
framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) {
......
......@@ -29,7 +29,7 @@ namespace distributed = paddle::distributed;
using MultiVarMsg = ::paddle::distributed::MultiVariableMessage;
using VarMsg = ::paddle::distributed::VariableMessage;
USE_OP(scale);
USE_OP_ITSELF(scale);
std::shared_ptr<distributed::HeterServer> b_rpc_service;
......
......@@ -31,7 +31,7 @@ namespace distributed = paddle::distributed;
using MultiVarMsg = ::paddle::distributed::MultiVariableMessage;
using VarMsg = ::paddle::distributed::VariableMessage;
USE_OP(scale);
USE_OP_ITSELF(scale);
USE_OP(send_and_recv);
std::shared_ptr<distributed::HeterServer> b_rpc_service;
......
......@@ -35,7 +35,7 @@ namespace memory = paddle::memory;
using MultiVarMsg = ::paddle::distributed::MultiVariableMessage;
using VarMsg = ::paddle::distributed::VariableMessage;
USE_OP(scale);
USE_OP_ITSELF(scale);
USE_OP(send_and_recv);
std::shared_ptr<distributed::HeterServer> b_rpc_service2;
......
......@@ -12,49 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/scale_op.h"
#include <string>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace framework {
class InferShapeContext;
class OpDesc;
} // namespace framework
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
#include "paddle/pten/core/infermeta_utils.h"
#include "paddle/pten/infermeta/unary.h"
namespace paddle {
namespace operators {
class ScaleOp : public framework::OperatorWithKernel {
public:
ScaleOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "scale");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "scale");
if (ctx->IsRuntime() && ctx->HasInput("ScaleTensor")) {
auto scale = ctx->Inputs("ScaleTensor");
PADDLE_ENFORCE_EQ(scale.size(), 1,
platform::errors::InvalidArgument(
"Input(ScaleTensor) size must be 1, "
"but received size is %d.",
scale.size()));
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
using framework::OperatorWithKernel::OperatorWithKernel;
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
......@@ -150,32 +120,10 @@ DECLARE_INPLACE_OP_INFERER(ScaleOpInplaceInferer, {"X", "Out"});
namespace ops = paddle::operators;
DELCARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor,
PT_INFER_META(pten::UnchangedInferMeta));
REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker,
ops::ScaleGradMaker<paddle::framework::OpDesc>,
ops::ScaleGradMaker<paddle::imperative::OpBase>,
ops::ScaleOpVarTypeInference, ops::ScaleOpInplaceInferer);
REGISTER_OP_CPU_KERNEL(
scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,
ops::ScaleKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>,
ops::ScaleKernel<paddle::platform::CPUDeviceContext, uint8_t>,
ops::ScaleKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::ScaleKernel<paddle::platform::CPUDeviceContext, int16_t>,
ops::ScaleKernel<paddle::platform::CPUDeviceContext, int>,
ops::ScaleKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
scale,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, float>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, double>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
uint8_t>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int8_t>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
int16_t>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
int64_t>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>);
ScaleInferShapeFunctor, ops::ScaleOpVarTypeInference,
ops::ScaleOpInplaceInferer);
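The diff viewer interleaves the removed kernel registrations with the new functor-based registration above, so the resulting registration in scale_op.cc presumably reduces to roughly the following (a reconstruction from the fragments shown, not a verbatim excerpt of the commit):

DELCARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor,
                            PT_INFER_META(pten::UnchangedInferMeta));
REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker,
                  ops::ScaleGradMaker<paddle::framework::OpDesc>,
                  ops::ScaleGradMaker<paddle::imperative::OpBase>,
                  ScaleInferShapeFunctor, ops::ScaleOpVarTypeInference,
                  ops::ScaleOpInplaceInferer);
// The former REGISTER_OP_CPU_KERNEL / REGISTER_OP_CUDA_KERNEL blocks for scale
// are deleted; the compute kernels now live in pten ("scale" and "scale_sr").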
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/pten_utils.h"
// only can include the headers in paddle/top/api dirs
#include "paddle/pten/api/lib/utils/tensor_utils.h"
#include "paddle/pten/kernels/scale_kernel.h"
namespace paddle {
namespace operators {
template <typename T>
static inline T GetAttrFromTensor(const framework::Tensor* tensor) {
const auto* tensor_data = tensor->data<T>();
framework::Tensor cpu_tensor;
if (platform::is_gpu_place(tensor->place()) ||
platform::is_npu_place(tensor->place())) {
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(),
&cpu_tensor);
tensor_data = cpu_tensor.data<T>();
}
return tensor_data[0];
}
// See Note [ Why still keep the original kernel implementation? ]
template <typename DeviceContext, typename T>
class ScaleKernel : public framework::OpKernel<T> {
public:
virtual void Compute(const framework::ExecutionContext& ctx) const {
auto* in_var = ctx.InputVar("X");
auto bias = ctx.Attr<float>("bias");
auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
auto scale = ctx.Attr<float>("scale");
auto* out_var = ctx.OutputVar("Out");
if (ctx.HasInput("ScaleTensor")) {
auto* scale_tensor = ctx.Input<framework::Tensor>("ScaleTensor");
scale = static_cast<float>(GetAttrFromTensor<T>(scale_tensor));
}
auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
auto* out =
framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
out->mutable_data<T>(in->place());
auto& dev_ctx = ctx.device_context<DeviceContext>();
// call new kernel
if (in_var->IsType<pten::SelectedRows>()) {
pten::ScaleSR<T>(
static_cast<const typename framework::ConvertToPtenContext<
DeviceContext>::TYPE&>(dev_ctx),
in_var->Get<pten::SelectedRows>(), scale, bias, bias_after_scale,
out_var->GetMutable<pten::SelectedRows>());
} else {
pten::ScaleKernel<T>(
static_cast<const typename framework::ConvertToPtenContext<
DeviceContext>::TYPE&>(dev_ctx),
*in, scale, bias, bias_after_scale, out);
}
}
};
} // namespace operators
} // namespace paddle
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/scale_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
......
......@@ -12,12 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/scale_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
namespace operators {
template <typename T>
static inline T GetAttrFromTensor(const framework::Tensor* tensor) {
const auto* tensor_data = tensor->data<T>();
framework::Tensor cpu_tensor;
if (platform::is_gpu_place(tensor->place()) ||
platform::is_npu_place(tensor->place())) {
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(),
&cpu_tensor);
tensor_data = cpu_tensor.data<T>();
}
return tensor_data[0];
}
template <typename T>
class ScaleNPUKernel : public framework::OpKernel<T> {
public:
......
......@@ -14,8 +14,8 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/scale_op.h"
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/pten/kernels/scale_kernel.h"
namespace paddle {
......
......@@ -198,12 +198,25 @@ pten::ScalarArray MakePtenScalarArrayFromVarList(
return {vector_data};
}
void ResetTensorByArgDef(pten::DenseTensor* dst,
const pten::TensorArgDef& arg_def) {
void ResetTensorDtypeAndLayoutByArgDef(pten::TensorBase* dst,
const pten::TensorArgDef& arg_def) {
VLOG(5) << "ResetTensor by TensorArgDef.";
auto* meta = pten::DenseTensorUtils::GetMutableMeta(dst);
meta->dtype = arg_def.dtype;
meta->layout = arg_def.layout;
if (pten::DenseTensor::classof(dst)) {
auto* dense_t = static_cast<pten::DenseTensor*>(dst);
auto* meta = pten::DenseTensorUtils::GetMutableMeta(dense_t);
meta->dtype = arg_def.dtype;
meta->layout = arg_def.layout;
} else if (pten::SelectedRows::classof(dst)) {
auto* selected_rows = static_cast<pten::SelectedRows*>(dst);
auto* meta =
pten::DenseTensorUtils::GetMutableMeta(selected_rows->mutable_value());
meta->dtype = arg_def.dtype;
meta->layout = arg_def.layout;
} else {
PADDLE_THROW(pten::errors::Unimplemented(
"Unsupported tensor type is received when reseting tensor dtype and "
"layout by argument definition."));
}
}
} // namespace experimental
......
......@@ -45,8 +45,8 @@ pten::ScalarArray MakePtenScalarArrayFromVar(
pten::ScalarArray MakePtenScalarArrayFromVarList(
const std::vector<framework::Variable*>& variable_list);
void ResetTensorByArgDef(pten::DenseTensor* dst,
const pten::TensorArgDef& arg_def);
void ResetTensorDtypeAndLayoutByArgDef(pten::TensorBase* dst,
const pten::TensorArgDef& arg_def);
} // namespace experimental
} // namespace paddle
......@@ -48,10 +48,6 @@ void KernelContext::EmplaceBackOutputWithoutSetRange(TensorBase* output) {
outputs_.emplace_back(output);
}
void KernelContext::SetOutputWithoutSetRange(int index, TensorBase* output) {
outputs_.at(index) = output;
}
void KernelContext::EmplaceBackOutputs(
paddle::SmallVector<TensorBase*> outputs) {
int index = outputs_.size();
......@@ -103,15 +99,4 @@ const std::pair<int, int>& KernelContext::OutputRangeAt(size_t idx) const {
return output_range_.at(idx);
}
std::pair<int, int>& KernelContext::MutableInputRangeAt(size_t idx) {
return input_range_[idx];
}
std::pair<int, int>& KernelContext::MutableOutputRangeAt(size_t idx) {
return output_range_[idx];
}
// Temporary method: For compatible with fluid Tensor and improve performance
// Only deal with DenseTensor now
void KernelContext::ClearData() { attrs_.clear(); }
} // namespace pten
......@@ -26,10 +26,8 @@
namespace pten {
using DeviceContext = pten::DeviceContext;
/**
* Note: KernelContext doesn't manage the life if DeviceContext and Tensor
* Note: KernelContext doesn't manage the life of DeviceContext and Tensor
*
* Note: KernelContext does not couple the concept of framework,
* its constructor can only take the members it needs as parameters,
......@@ -59,17 +57,15 @@ class KernelContext {
void EmplaceBackOutputs(paddle::SmallVector<TensorBase*> outputs);
void SetOutputWithoutSetRange(int index, TensorBase* output);
void EmplaceBackAttr(paddle::any attr);
const std::pair<int, int>& InputRangeAt(size_t idx) const;
const std::pair<int, int>& OutputRangeAt(size_t idx) const;
std::pair<int, int>& MutableInputRangeAt(size_t idx);
void AssignInputRange(std::pair<int, int>&& range, size_t idx);
std::pair<int, int>& MutableOutputRangeAt(size_t idx);
void AssignOutputRange(std::pair<int, int>&& range, size_t idx);
template <typename TensorType>
const TensorType& InputAt(size_t idx) const {
......@@ -90,15 +86,11 @@ class KernelContext {
for (size_t i = start; i < end; ++i) {
auto t = static_cast<const TensorType*>(inputs_.at(i));
v.emplace_back(*t);
inputs_.at(i) = nullptr;
inputs_[i] = nullptr;
}
return v;
}
void AssignInputRange(std::pair<int, int>&& range, size_t idx);
void AssignOutputRange(std::pair<int, int>&& range, size_t idx);
template <typename TensorType>
TensorType* MutableOutputAt(size_t idx) {
return static_cast<TensorType*>(outputs_.at(idx));
......@@ -110,7 +102,6 @@ class KernelContext {
for (size_t i = start; i < end; ++i) {
v.emplace_back(static_cast<TensorType*>(outputs_.at(i)));
}
return v;
}
......@@ -124,25 +115,17 @@ class KernelContext {
}
}
// Temporary method: For compatible with fluid Tensor and improve performance
// Only deal with DenseTensor now
void ClearData();
size_t InputsSize() const { return inputs_.size(); }
size_t OutputsSize() const { return outputs_.size(); }
size_t AttrsSize() const { return attrs_.size(); }
private:
// DeviceContext base class
DeviceContext* dev_ctx_;
// TODO(chenweihang): Tensor -> Tensor*, Tensor should by managed `scope`
// Note: can't use API Tensor here, the inference don't use this API Tensor
paddle::SmallVector<const TensorBase*> inputs_;
paddle::SmallVector<TensorBase*> outputs_;
paddle::SmallVector<paddle::any> attrs_;
// Only contains input like list[Tensor] need `range`
paddle::SmallVector<std::pair<int, int>> input_range_;
paddle::SmallVector<std::pair<int, int>> output_range_;
};
......
......@@ -16,9 +16,37 @@ limitations under the License. */
namespace pten {
/**
* Note [ Why does the ArgumentMapping function need to be so complicated? ]
*
* In order to meet the requirements of infrt, the function used to match Op
* and Kernel parameters needs to be placed in pten as a compatible component
* and must not depend on fluid.
*
* Because infrt not only needs to dynamically call this argument mapping
* function at runtime, but also needs to statically declare all possible
* results of the function before running without any information.
*
* The infrt declaration looks like:
*
* def PDKEL_Reshape_to_CPU : Pat<
* (PD_ReshapeOp $x, $shape_tensor, $shape_attr), // OpMaker arguments
* (PDKEL_ReshapeKernelAttr $x, fn($shape_attr)>; // Kernel arguments
* def PDKEL_Reshape_to_CPU : Pat<
* (PD_ReshapeOp $x, $shape_tensor, $shape_attr),
* (PDKEL_ReshapeKernelAttr $x, fn($shape_tensor)>;
*
* Therefore, we need to write out each possible result of the argument mapping
* function, like `KernelSignature("full", {}, {"ShapeTensor", "value"}, {"Out"})`;
* it cannot contain variables, only const char* strings.
*
* Infrt parses all of these results before running in order to generate the
* static declarations above, which forces some mapping functions to be written
* out at length; the complicated ones may run to hundreds of lines, which has
* certain side effects on the programming experience.
*/
KernelSignature ScaleOpArgumentMapping(const ArgumentMappingContext& ctx) {
if (ctx.IsDenseTensorInput("X")) {
std::string scale_attr;
if (ctx.HasInput("ScaleTensor")) {
return KernelSignature(
"scale", {"X"}, {"ScaleTensor", "bias", "bias_after_scale"}, {"Out"});
......@@ -26,9 +54,19 @@ KernelSignature ScaleOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature(
"scale", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"});
}
} else if (ctx.IsSelectedRowsInput("X")) {
if (ctx.HasInput("ScaleTensor")) {
return KernelSignature("scale_sr",
{"X"},
{"ScaleTensor", "bias", "bias_after_scale"},
{"Out"});
} else {
return KernelSignature(
"scale_sr", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"});
}
} else {
return KernelSignature("unregistered", {}, {}, {});
}
// TODO(chenweihang): support other cases after selected rows added
return KernelSignature("scale.unregistered", {}, {}, {});
}
} // namespace pten
......