Unverified commit c6478270, authored by YuanRisheng, committed by GitHub

[Pten]Remove reshape and elementwise_add's registry code in Fluid (#39317)

* remove reshape and elementwise_add registry

* delete code

* fix bugs when running CI unit tests

* remove log

* fix bugs when running unit tests

* fix bugs when running unit tests

* fix bugs when running CINN

* fix bugs when running ci-mac-python3

* fix compile bugs

* fix compile bugs

* fix compile bugs

* fix bugs when running on Kunlun

* fix bugs when compiling

* update code according to review comments
Parent 4157579e
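Reviewer context: in Fluid, USE_OP(op) pulls in both the operator definition and all of its registered kernels, while USE_OP_ITSELF(op) pulls in only the operator definition. Because this commit deletes the Fluid kernel registrations for elementwise_add and reshape2 (their kernels are now registered through the Pten kernel factory), every test that previously referenced them via USE_OP switches to USE_OP_ITSELF; otherwise it would reference a kernel-registrar symbol that no longer exists. A minimal sketch of the linker-anchoring pattern behind these macros is shown below; the symbol names are illustrative, not the verbatim definitions from paddle/fluid/framework/op_registry.h.

// Simplified sketch (illustrative names; not the verbatim Paddle macros).
// REGISTER_OPERATOR(op, ...) emits a "touch" function; a translation unit
// that needs the op forces its object file to be linked in by calling that
// function from a static initializer.
#define USE_OP_ITSELF(op_type)               \
  extern int TouchOpRegistrar_##op_type();   \
  static int use_op_itself_##op_type =       \
      TouchOpRegistrar_##op_type()

// USE_OP additionally touches the Fluid kernel registrars -- exactly the
// symbols this commit removes for elementwise_add/reshape2, which is why
// those call sites change to USE_OP_ITSELF.
#define USE_OP(op_type)    \
  USE_OP_ITSELF(op_type);  \
  USE_OP_KERNEL(op_type)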
@@ -25,7 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP(fill_constant);
 namespace paddle {
...
@@ -176,6 +176,6 @@ TEST(Benchmark, EagerIntermediateMLPCPU) {
 }
 USE_OP_ITSELF(scale);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP(matmul_v2);
 USE_OP(reduce_sum);
@@ -189,6 +189,6 @@ USE_OP_ITSELF(scale);
 USE_OP(matmul_v2);
 USE_OP(reduce_sum);
 USE_OP(reduce_sum_grad);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 #endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
@@ -212,6 +212,6 @@ TEST(Benchmark, FluidMLPCPU) {
 }  // namespace paddle
 USE_OP_ITSELF(scale);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP(matmul_v2);
 USE_OP(reduce_sum);
@@ -249,6 +249,6 @@ USE_OP_ITSELF(scale);
 USE_OP(matmul_v2);
 USE_OP(reduce_sum);
 USE_OP(reduce_sum_grad);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 #endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
@@ -123,5 +123,5 @@ TEST(Generated, ElementwiseAdd) {
 }  // namespace egr
 USE_OP(sigmoid);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP(matmul_v2);
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/pten/core/kernel_factory.h"
 namespace paddle {
 namespace framework {
@@ -271,25 +272,41 @@ bool FuseOptimizerOpPass::OpWithKernelSupportCPUAndGPU(
   if (op_type == "c_sync_calc_stream" || op_type == "c_sync_comm_stream") {
     return true;
   }
-  auto &all_kernels = OperatorWithKernel::AllOpKernels();
-  auto it = all_kernels.find(op_type);
-  // skip op not has kernel
-  if (it != all_kernels.end()) {
-    bool support_cpu = false;
-    bool support_gpu = false;
-    for (auto &kernel_pair : it->second) {
-      if (platform::is_cpu_place(kernel_pair.first.place_)) {
-        support_cpu = true;
-      }
-      if (platform::is_gpu_place(kernel_pair.first.place_)) {
-        support_gpu = true;
-      }
-    }
-    VLOG(6) << "Op check: " << op_type << ", support CPU: " << support_cpu
-            << ", support GPU: " << support_gpu;
-    return support_cpu && support_gpu;
-  }
-  return true;
+  bool support_cpu = false;
+  bool support_gpu = false;
+  auto &kernel_factory = pten::KernelFactory::Instance();
+  auto kernel_key_map =
+      kernel_factory.SelectKernelMap(pten::TransToPtenKernelName(op_type));
+  bool has_op_kernel = kernel_key_map.size() > 0 ? true : false;
+  for (auto &kernel : kernel_key_map) {
+    if (platform::is_gpu_place(
+            pten::TransToPtenPlace(kernel.first.backend()))) {
+      support_gpu = true;
+    } else if (platform::is_cpu_place(
+                   pten::TransToPtenPlace(kernel.first.backend()))) {
+      support_cpu = true;
+    }
+  }
+  if (!support_cpu || !support_gpu) {
+    auto &all_kernels = OperatorWithKernel::AllOpKernels();
+    auto it = all_kernels.find(op_type);
+    // skip op not has kernel
+    if (it != all_kernels.end()) {
+      has_op_kernel = true;
+      for (auto &kernel_pair : it->second) {
+        if (platform::is_cpu_place(kernel_pair.first.place_)) {
+          support_cpu = true;
+        } else if (platform::is_gpu_place(kernel_pair.first.place_)) {
+          support_gpu = true;
+        }
+      }
+    }
+  }
   VLOG(6) << "Op check: " << op_type << ", support CPU: " << support_cpu
           << ", support GPU: " << support_gpu;
+  return has_op_kernel ? (support_cpu && support_gpu) : true;
 }
 bool FuseOptimizerOpPass::GradGeneratedOpKernelCheck(
...
@@ -26,7 +26,7 @@
 USE_OP(mul);
 USE_OP(cinn_launch);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 namespace paddle::framework {
 using Name2VarInfoMap =
...
@@ -23,8 +23,8 @@
 USE_OP_ITSELF(scale);
 USE_OP(elementwise_mul);
-USE_OP(elementwise_add);
-USE_OP(elementwise_add_grad);
+USE_OP_ITSELF(elementwise_add);
+USE_OP_ITSELF(elementwise_add_grad);
 DECLARE_double(eager_delete_tensor_gb);
...
@@ -29,7 +29,7 @@ USE_OP(batch_norm);
 USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN);
 USE_OP(conv2d_transpose);
 USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
 USE_OP(gelu);
 USE_OP_DEVICE_KERNEL(gelu, MKLDNN);
...
@@ -22,7 +22,7 @@
 USE_OP(softmax);
 USE_OP_DEVICE_KERNEL(softmax, MKLDNN);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
 USE_OP(leaky_relu);
 USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN);
...
@@ -67,4 +67,4 @@ TEST(NaiveExecutor, Basic) {
 }  // namespace framework
 }  // namespace paddle
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
@@ -25,12 +25,12 @@ USE_OP(fill_constant);
 USE_OP(uniform_random);
 USE_OP(lookup_table);
 USE_OP(transpose2);
-USE_OP(reshape2);
+USE_OP_ITSELF(reshape2);
 USE_OP(split);
 USE_OP(slice);
 USE_OP(concat);
 USE_OP(matmul);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP(sigmoid);
 USE_OP(tanh);
 USE_OP(elementwise_mul);
@@ -39,9 +39,9 @@ USE_OP(reduce_mean);
 USE_OP(reduce_sum);
 USE_OP(reduce_sum_grad);
 USE_OP(reduce_mean_grad);
-USE_OP(reshape2_grad);
+USE_OP_ITSELF(reshape2_grad);
 USE_OP(softmax_with_cross_entropy_grad);
-USE_OP(elementwise_add_grad);
+USE_OP_ITSELF(elementwise_add_grad);
 USE_OP(matmul_grad);
 USE_OP(square);
 USE_OP(transpose2_grad);
...
@@ -1336,8 +1336,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
     const ExecutionContext& ctx) const {
-  auto& dev_ctx = ctx.device_context();
   auto expected_kernel_key = this->GetExpectedKernelType(ctx);
   if (HasAttr("op_device")) {
     if (Attr<std::string>("op_device") == "cpu") {
@@ -1354,12 +1352,20 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
   }
   // when the Op that only has CPUKernel is assigned to GPU, the CPUKernel
   // will be executed and a warning will be given at the same time.
-  if (SupportGPU()) {
-    expected_kernel_key.place_ = dev_ctx.GetPlace();
-  } else if (SupportNPU()) {
-    expected_kernel_key.place_ = dev_ctx.GetPlace();
-  } else {
-    expected_kernel_key.place_ = platform::CPUPlace();
+  expected_kernel_key.place_ = platform::CPUPlace();
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  if (SupportGPU()) {
+    auto& dev_ctx = ctx.device_context();
+    expected_kernel_key.place_ = dev_ctx.GetPlace();
+  }
+#endif
+#ifdef PADDLE_WITH_ASCEND_CL
+  if (SupportNPU()) {
+    auto& dev_ctx = ctx.device_context();
+    expected_kernel_key.place_ = dev_ctx.GetPlace();
+  }
+#endif
+  if (platform::is_cpu_place(expected_kernel_key.place_)) {
     LOG_FIRST_N(WARNING, 1)
         << "Op(" << type_
         << ") has no CUDA implementation. It will be assigned to CPUPlace.";
@@ -1934,12 +1940,10 @@ Scope* OperatorWithKernel::PreparePtenData(
   for (size_t i = 0; i < input_defs.size(); ++i) {
     auto& in_def = input_defs.at(i);
-    auto it = ctx->inputs.find(input_names[i]);
-    if (it == ctx->inputs.end()) {
+    if (ctx->inputs.find(input_names[i]) == ctx->inputs.end()) {
       continue;
     }
-    auto& ins_vector = it->second;
+    auto& ins_vector = ctx->inputs.at(input_names[i]);
     auto& name_vec = name_map.at(input_names[i]);
     bool should_skip_input =
         no_buffer_ins && no_buffer_ins->count(input_names[i]) > 0;
@@ -1950,7 +1954,6 @@ Scope* OperatorWithKernel::PreparePtenData(
       if (var == nullptr || !VarIsTensor(*var)) {
         continue;
       }
       auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
       // When no_buffer_ins then checking of Tensor::holder_ is
...
@@ -661,6 +661,6 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) {
 USE_PASS(build_cinn_pass);
 USE_OP(mul);
 USE_OP(relu);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP(relu_grad);
-USE_OP(elementwise_add_grad);
+USE_OP_ITSELF(elementwise_add_grad);
@@ -302,4 +302,4 @@ USE_PASS(build_cinn_pass);
 USE_PASS(graph_viz_pass);
 USE_OP(mul);
 USE_OP(relu);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
@@ -270,7 +270,8 @@ void BuildDygraphPtenKernelContext(
       kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr);
       auto end_idx = start_idx + 1;
       kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
-    } else {
+      continue;
+    }
     auto ins_vector = it->second;
     size_t end_idx = start_idx + ins_vector.size();
@@ -290,7 +291,6 @@ void BuildDygraphPtenKernelContext(
     }
     kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
   }
-  }
   for (size_t i = 0; i < output_names.size(); ++i) {
     size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second);
@@ -468,8 +468,7 @@ void PreparePtenData(const pten::Kernel& pt_kernel,
   for (size_t i = 0; i < input_names.size(); ++i) {
     auto& in_def = input_defs.at(i);
-    auto it = ins.find(input_names[i]);
-    if (it == ins.end()) {
+    if (ins.find(input_names[i]) == ins.end()) {
       continue;
     }
     auto& ins_vector = ins.at(input_names[i]);
...
@@ -265,5 +265,5 @@ TEST(TestHooks, TestGradVarLeafBackwardHookWithSortedGradAccmulated) {
 USE_OP(mul);
 USE_OP(mul_grad);
-USE_OP(elementwise_add);
-USE_OP(elementwise_add_grad);
+USE_OP_ITSELF(elementwise_add);
+USE_OP_ITSELF(elementwise_add_grad);
@@ -553,4 +553,4 @@ USE_OP(mul);
 USE_OP(mul_grad);
 USE_OP(reduce_sum);
 USE_OP(reduce_sum_grad);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
@@ -103,5 +103,5 @@ TEST(elementwise_op, plugin) {
 }  // namespace inference
 }  // namespace paddle
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP(elementwise_mul);
@@ -28,7 +28,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/init.h"
 USE_OP(cinn_launch);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 namespace paddle {
 namespace operators {
...
@@ -32,7 +32,7 @@ limitations under the License. */
 namespace f = paddle::framework;
 namespace p = paddle::platform;
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
 USE_OP_DEVICE_KERNEL(c_sync_calc_stream, NPU);
...
@@ -117,55 +117,6 @@ REGISTER_OPERATOR(elementwise_add_triple_grad, ops::ElementwiseOpTripleGrad,
                   ops::ElementwiseTripleGradOpInplaceInferer,
                   ops::ElementwiseTripleGradNoBufVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_add,
-    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext,
-                              paddle::platform::complex<float>>,
-    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext,
-                              paddle::platform::complex<double>>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_add_grad,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext,
-                                  paddle::platform::complex<float>>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext,
-                                  paddle::platform::complex<double>>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_add_grad_grad,
-    ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        float>,
-    ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        double>,
-    ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int>,
-    ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int64_t>,
-    ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::complex<float>>,
-    ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::complex<double>>);
-REGISTER_OP_CPU_KERNEL(
-    elementwise_add_triple_grad,
-    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        float>,
-    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        double>,
-    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int>,
-    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        int64_t>,
-    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::complex<float>>,
-    ops::ElementwiseAddTripleGradKernel<paddle::platform::CPUDeviceContext,
-                                        paddle::platform::complex<double>>);
 // A specialization elementwise_add operator, used in gradient accumulation with
 // inplace addto.
 REGISTER_OPERATOR(
...
@@ -18,51 +18,6 @@ limitations under the License. */
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
-namespace paddle {
-namespace operators {}  // namespace operators
-}  // namespace paddle
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::complex<float>>,
-    ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_add_grad,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext,
-                                  plat::complex<float>>,
-    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext,
-                                  plat::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_add_grad_grad,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext,
-                                        plat::complex<float>>,
-    ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext,
-                                        plat::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    elementwise_add_triple_grad,
-    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext,
-                                        plat::complex<float>>,
-    ops::ElementwiseAddTripleGradKernel<plat::CUDADeviceContext,
-                                        plat::complex<double>>);
 REGISTER_OP_CUDA_KERNEL(
     grad_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
     ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>,
...
@@ -43,73 +43,5 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
   }
 };
-template <typename DeviceContext, typename T>
-class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    using Tensor = framework::Tensor;
-    auto *x = ctx.Input<Tensor>("X");
-    auto *y = ctx.Input<Tensor>("Y");
-    auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    const auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    int axis = ctx.Attr<int>("axis");
-    pten::AddGradKernel<T>(
-        static_cast<const typename framework::ConvertToPtenContext<
-            DeviceContext>::TYPE &>(dev_ctx),
-        *x, *y, *dout, axis, dx, dy);
-  }
-};
-template <typename DeviceContext, typename T>
-class ElementwiseAddDoubleGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    using Tensor = framework::Tensor;
-    auto *y = ctx.Input<Tensor>("Y");
-    auto *dout = ctx.Input<Tensor>("DOut");
-    auto *ddx = ctx.Input<Tensor>("DDX");
-    auto *ddy = ctx.Input<Tensor>("DDY");
-    auto *ddout = ctx.Output<Tensor>("DDOut");
-    const auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    int axis = ctx.Attr<int>("axis");
-    paddle::optional<const pten::DenseTensor &> ddx_optional = paddle::none;
-    paddle::optional<const pten::DenseTensor &> ddy_optional = paddle::none;
-    if (ddx != nullptr) {
-      ddx_optional = *ddx;
-    }
-    if (ddy != nullptr) {
-      ddy_optional = *ddy;
-    }
-    pten::AddDoubleGradKernel<T>(
-        static_cast<const typename framework::ConvertToPtenContext<
-            DeviceContext>::TYPE &>(dev_ctx),
-        *y, ddx_optional, ddy_optional, *dout, axis, ddout);
-  }
-};
-template <typename DeviceContext, typename T>
-class ElementwiseAddTripleGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    using Tensor = framework::Tensor;
-    auto *ddx = ctx.Input<Tensor>("DDX");
-    auto *ddy = ctx.Input<Tensor>("DDY");
-    auto *d_ddout = ctx.Input<Tensor>("D_DDOut");
-    auto *d_ddx = ctx.Output<Tensor>("D_DDX");
-    auto *d_ddy = ctx.Output<Tensor>("D_DDY");
-    const auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    int axis = ctx.Attr<int>("axis");
-    pten::AddTripleGradKernel<T>(
-        static_cast<const typename framework::ConvertToPtenContext<
-            DeviceContext>::TYPE &>(dev_ctx),
-        *ddx, *ddy, *d_ddout, axis, d_ddx, d_ddy);
-  }
-};
 }  // namespace operators
 }  // namespace paddle
@@ -31,7 +31,7 @@ limitations under the License. */
 namespace f = paddle::framework;
 namespace p = paddle::platform;
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
 USE_OP(elementwise_sub);
 USE_OP_DEVICE_KERNEL(elementwise_sub, NPU);
...
@@ -18,7 +18,7 @@
 #include "paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h"
 #include "paddle/fluid/platform/place.h"
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 namespace paddle {
 namespace operators {
...
@@ -22,7 +22,7 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 namespace paddle {
 namespace operators {
...
@@ -27,7 +27,7 @@ namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 USE_OP(matmul);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 // get paddle matmul op results as baseline
 template <typename T>
...
@@ -25,7 +25,7 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
 USE_OP(elementwise_mul);
 USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN);
...
@@ -25,7 +25,7 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
 USE_OP(relu);
 USE_OP_DEVICE_KERNEL(relu, MKLDNN);
...
@@ -18,7 +18,7 @@
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
-USE_OP(elementwise_add_grad);
+USE_OP_ITSELF(elementwise_add_grad);
 namespace paddle {
 namespace operators {
...
@@ -657,30 +657,6 @@ REGISTER_OPERATOR(reshape2_grad_grad, ops::Reshape2DoubleGradOp,
                   ops::ReshapeDoubleGradInplaceInferer,
                   ops::ReshapeDoubleGradOpNoNeedBufferVarInferer);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(
-    reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int8_t,
-    ops::ReshapeKernel, uint8_t, ops::ReshapeKernel, int, ops::ReshapeKernel,
-    int64_t, ops::ReshapeKernel, bool, ops::ReshapeKernel,
-    paddle::platform::bfloat16, ops::ReshapeKernel,
-    paddle::platform::complex<float>, ops::ReshapeKernel,
-    paddle::platform::complex<double>, ops::ReshapeKernel);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(
-    reshape2_grad, float, ops::ReshapeGradKernel, double,
-    ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, uint8_t,
-    ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, bool,
-    ops::ReshapeGradKernel, paddle::platform::bfloat16, ops::ReshapeGradKernel,
-    paddle::platform::complex<float>, ops::ReshapeGradKernel,
-    paddle::platform::complex<double>, ops::ReshapeGradKernel);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(
-    reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double,
-    ops::ReshapeDoubleGradKernel, int, ops::ReshapeDoubleGradKernel, uint8_t,
-    ops::ReshapeDoubleGradKernel, int64_t, ops::ReshapeDoubleGradKernel, bool,
-    ops::ReshapeDoubleGradKernel, paddle::platform::bfloat16,
-    ops::ReshapeDoubleGradKernel, paddle::platform::complex<float>,
-    ops::ReshapeDoubleGradKernel, paddle::platform::complex<double>,
-    ops::ReshapeDoubleGradKernel);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
                                 ops::ReshapeKernel, int, ops::ReshapeKernel,
@@ -695,45 +671,4 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
                                 ops::ReshapeGradKernel, plat::float16,
                                 ops::ReshapeGradKernel, plat::bfloat16,
                                 ops::ReshapeGradKernel);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
-                                ops::ReshapeKernel, int, ops::ReshapeKernel,
-                                uint8_t, ops::ReshapeKernel, int64_t,
-                                ops::ReshapeKernel, plat::float16,
-                                ops::ReshapeKernel, bool, ops::ReshapeKernel,
-                                plat::complex<float>, ops::ReshapeKernel,
-                                plat::complex<double>, ops::ReshapeKernel,
-                                plat::bfloat16, ops::ReshapeKernel);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(
-    reshape2_grad, float, ops::ReshapeGradKernel, double,
-    ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, uint8_t,
-    ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, plat::float16,
-    ops::ReshapeGradKernel, bool, ops::ReshapeGradKernel, plat::complex<float>,
-    ops::ReshapeGradKernel, plat::complex<double>, ops::ReshapeGradKernel,
-    plat::bfloat16, ops::ReshapeGradKernel);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(
-    reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double,
-    ops::ReshapeDoubleGradKernel, int, ops::ReshapeDoubleGradKernel, uint8_t,
-    ops::ReshapeDoubleGradKernel, int64_t, ops::ReshapeDoubleGradKernel,
-    plat::float16, ops::ReshapeDoubleGradKernel, bool,
-    ops::ReshapeDoubleGradKernel, plat::complex<float>,
-    ops::ReshapeDoubleGradKernel, plat::complex<double>,
-    ops::ReshapeDoubleGradKernel, plat::bfloat16, ops::ReshapeDoubleGradKernel);
-#endif
-#ifdef PADDLE_WITH_XPU
-REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
-                               ops::ReshapeKernel, int, ops::ReshapeKernel,
-                               int64_t, ops::ReshapeKernel, plat::float16,
-                               ops::ReshapeKernel, bool, ops::ReshapeKernel,
-                               plat::complex<float>, ops::ReshapeKernel,
-                               plat::complex<double>, ops::ReshapeKernel);
-REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
-                               double, ops::ReshapeGradKernel, int,
-                               ops::ReshapeGradKernel, int64_t,
-                               ops::ReshapeGradKernel, plat::float16,
-                               ops::ReshapeGradKernel, bool,
-                               ops::ReshapeGradKernel, plat::complex<float>,
-                               ops::ReshapeGradKernel, plat::complex<double>,
-                               ops::ReshapeGradKernel);
 #endif
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
 USE_OP(relu);
-USE_OP(elementwise_add);
+USE_OP_ITSELF(elementwise_add);
 USE_OP(softmax);
 namespace paddle {
...
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,10 +16,8 @@ limitations under the License. */
 #include "paddle/pten/backends/cpu/cpu_context.h"
 #include "paddle/pten/backends/gpu/gpu_context.h"
-#include "paddle/pten/core/kernel_registry.h"
-
-// See Note [ Why still include the fluid headers? ]
 #include "paddle/pten/common/bfloat16.h"
+#include "paddle/pten/core/kernel_registry.h"
 namespace pten {
 template <typename T, typename Context>
...
@@ -75,6 +75,31 @@ KernelSignature ElementwiseAddGradOpArgumentMapping(
   return KernelSignature("unregistered", {}, {}, {});
 }
+KernelSignature ElementwiseAddDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "add_double_grad", {"Y", "DDX", "DDY", "DOut"}, {"axis"}, {"DDOut"});
+}
+KernelSignature ElementwiseAddTripleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("add_triple_grad",
+                         {"DDX", "DDY", "D_DDOut"},
+                         {"axis"},
+                         {"D_DDX", "D_DDY"});
+}
+KernelSignature ElementwiseSubGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorInput("X")) {
+    return KernelSignature("subtract_grad",
+                           {"X", "Y", GradVarName("Out")},
+                           {"axis"},
+                           {GradVarName("X"), GradVarName("Y")});
+  }
+  return KernelSignature("unregistered", {}, {}, {});
+}
 }  // namespace pten
 PT_REGISTER_BASE_KERNEL_NAME(elementwise_add, add);
@@ -82,6 +107,9 @@ PT_REGISTER_BASE_KERNEL_NAME(elementwise_sub, subtract);
 PT_REGISTER_BASE_KERNEL_NAME(elementwise_mul, multiply);
 PT_REGISTER_BASE_KERNEL_NAME(elementwise_div, divide);
 PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad, add_grad);
+PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad_grad, add_double_grad);
+PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_triple_grad, add_triple_grad);
+PT_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad, subtract_grad);
 PT_REGISTER_ARG_MAPPING_FN(elementwise_add,
                            pten::ElementwiseAddOpArgumentMapping);
@@ -93,3 +121,9 @@ PT_REGISTER_ARG_MAPPING_FN(elementwise_div,
                            pten::ElementwiseDivOpArgumentMapping);
 PT_REGISTER_ARG_MAPPING_FN(elementwise_add_grad,
                            pten::ElementwiseAddGradOpArgumentMapping);
+PT_REGISTER_ARG_MAPPING_FN(elementwise_add_grad_grad,
+                           pten::ElementwiseAddDoubleGradOpArgumentMapping);
+PT_REGISTER_ARG_MAPPING_FN(elementwise_add_triple_grad,
+                           pten::ElementwiseAddTripleGradOpArgumentMapping);
+PT_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad,
+                           pten::ElementwiseSubGradOpArgumentMapping);