register fluid kernels to phi [part 7] (#52577)

* update * fix bug * fix ci-windows-openblas * fix test_partial_sum_op * fix codestyle

register fluid kernels to phi [part 7] (#52577)
* update * fix bug * fix ci-windows-openblas * fix test_partial_sum_op * fix codestyle
aa35331f · huangjiyi · GitHub · 6913feb0 · aa35331f · aa35331f
59 changed files
--- a/paddle/fluid/operators/collective/partial_allgather_op.cc
+++ b/paddle/fluid/operators/collective/partial_allgather_op.cc
@@ -85,9 +85,12 @@ REGISTER_OPERATOR(
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
    ops::PartialAllGatherOpInplaceInferer)

-REGISTER_OP_CPU_KERNEL(partial_allgather,
-                       ops::PartialAllGatherOpCPUKernel<float>,
-                       ops::PartialAllGatherOpCPUKernel<double>,
-                       ops::PartialAllGatherOpCPUKernel<int>,
-                       ops::PartialAllGatherOpCPUKernel<int64_t>,
-                       ops::PartialAllGatherOpCPUKernel<plat::float16>);
+PD_REGISTER_STRUCT_KERNEL(partial_allgather,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::PartialAllGatherOpCPUKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
--- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc
+++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc
@@ -23,7 +23,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename T>
+template <typename T, typename DeviceContext>
 class PartialAllGatherOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -102,12 +102,16 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;

-REGISTER_OP_CUDA_KERNEL(partial_allgather,
-                        ops::PartialAllGatherOpCUDAKernel<float>,
+PD_REGISTER_STRUCT_KERNEL(partial_allgather,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::PartialAllGatherOpCUDAKernel,
+                          float,
+                          double,
 #if NCCL_VERSION_CODE >= 21000
-                        ops::PartialAllGatherOpCUDAKernel<plat::bfloat16>,
+                          plat::bfloat16,
 #endif
-                        ops::PartialAllGatherOpCUDAKernel<double>,
-                        ops::PartialAllGatherOpCUDAKernel<int>,
-                        ops::PartialAllGatherOpCUDAKernel<int64_t>,
-                        ops::PartialAllGatherOpCUDAKernel<plat::float16>);
+                          int,
+                          int64_t,
+                          plat::float16) {
+}
--- a/paddle/fluid/operators/collective/partial_allgather_op.h
+++ b/paddle/fluid/operators/collective/partial_allgather_op.h
@@ -26,7 +26,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename T>
+template <typename T, typename DeviceContext>
 class PartialAllGatherOpCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {

--- a/paddle/fluid/operators/collective/partial_recv_op.cc
+++ b/paddle/fluid/operators/collective/partial_recv_op.cc
@@ -129,9 +129,12 @@ REGISTER_OP_WITHOUT_GRADIENT(partial_recv,
                             ops::PartialRecvOp,
                             ops::PartialRecvOpMaker);

-REGISTER_OP_CPU_KERNEL(partial_recv,
-                       ops::PartialRecvOpCPUKernel<float>,
-                       ops::PartialRecvOpCPUKernel<double>,
-                       ops::PartialRecvOpCPUKernel<int>,
-                       ops::PartialRecvOpCPUKernel<int64_t>,
-                       ops::PartialRecvOpCPUKernel<plat::float16>);
+PD_REGISTER_STRUCT_KERNEL(partial_recv,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::PartialRecvOpCPUKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
--- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc
+++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc
@@ -23,7 +23,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename T>
+template <typename T, typename DeviceContext>
 class PartialRecvOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
@@ -118,12 +118,16 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;

-REGISTER_OP_CUDA_KERNEL(partial_recv,
-                        ops::PartialRecvOpCUDAKernel<float>,
+PD_REGISTER_STRUCT_KERNEL(partial_recv,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::PartialRecvOpCUDAKernel,
+                          float,
+                          double,
 #if NCCL_VERSION_CODE >= 21000
-                        ops::PartialRecvOpCUDAKernel<plat::bfloat16>,
+                          plat::bfloat16,
 #endif
-                        ops::PartialRecvOpCUDAKernel<double>,
-                        ops::PartialRecvOpCUDAKernel<int>,
-                        ops::PartialRecvOpCUDAKernel<int64_t>,
-                        ops::PartialRecvOpCUDAKernel<plat::float16>);
+                          int,
+                          int64_t,
+                          plat::float16) {
+}
--- a/paddle/fluid/operators/collective/partial_recv_op.h
+++ b/paddle/fluid/operators/collective/partial_recv_op.h
@@ -24,7 +24,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename T>
+template <typename T, typename DeviceContext>
 class PartialRecvOpCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {

--- a/paddle/fluid/operators/collective/partial_send_op.cc
+++ b/paddle/fluid/operators/collective/partial_send_op.cc
@@ -94,9 +94,12 @@ REGISTER_OP_WITHOUT_GRADIENT(partial_send,
                             ops::PartialSendOp,
                             ops::PartialSendMaker);

-REGISTER_OP_CPU_KERNEL(partial_send,
-                       ops::PartialSendOpCPUKernel<float>,
-                       ops::PartialSendOpCPUKernel<double>,
-                       ops::PartialSendOpCPUKernel<int>,
-                       ops::PartialSendOpCPUKernel<int64_t>,
-                       ops::PartialSendOpCPUKernel<plat::float16>);
+PD_REGISTER_STRUCT_KERNEL(partial_send,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::PartialSendOpCPUKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
--- a/paddle/fluid/operators/collective/partial_send_op.cu.cc
+++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc
@@ -24,7 +24,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename T>
+template <typename T, typename DeviceContext>
 class PartialSendCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -117,12 +117,16 @@ class PartialSendCUDAKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;

-REGISTER_OP_CUDA_KERNEL(partial_send,
-                        ops::PartialSendCUDAKernel<float>,
-                        ops::PartialSendCUDAKernel<double>,
+PD_REGISTER_STRUCT_KERNEL(partial_send,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::PartialSendCUDAKernel,
+                          float,
+                          double,
 #if NCCL_VERSION_CODE >= 21000
-                        ops::PartialSendCUDAKernel<plat::bfloat16>,
+                          plat::bfloat16,
 #endif
-                        ops::PartialSendCUDAKernel<int>,
-                        ops::PartialSendCUDAKernel<int64_t>,
-                        ops::PartialSendCUDAKernel<plat::float16>);
+                          int,
+                          int64_t,
+                          plat::float16) {
+}
--- a/paddle/fluid/operators/collective/partial_send_op.h
+++ b/paddle/fluid/operators/collective/partial_send_op.h
@@ -25,7 +25,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename T>
+template <typename T, typename DeviceContext>
 class PartialSendOpCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {

--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class PolygonBoxTransformCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -111,7 +111,10 @@ REGISTER_OPERATOR(
    ops::PolygonBoxTransformOpMaker,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(
-    polygon_box_transform,
-    ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, float>,
-    ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, double>);
+
+PD_REGISTER_STRUCT_KERNEL(polygon_box_transform,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::PolygonBoxTransformCPUKernel,
+                          float,
+                          double) {}
--- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu
@@ -38,7 +38,7 @@ __global__ void PolygonBoxTransformKernel(
  }
 }

-template <typename T>
+template <typename T, typename DeviceContext>
 class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -73,7 +73,10 @@ class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle

-REGISTER_OP_CUDA_KERNEL(
-    polygon_box_transform,
-    paddle::operators::PolygonBoxTransformOpCUDAKernel<float>,
-    paddle::operators::PolygonBoxTransformOpCUDAKernel<double>);
+namespace ops = paddle::operators;
+PD_REGISTER_STRUCT_KERNEL(polygon_box_transform,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::PolygonBoxTransformOpCUDAKernel,
+                          float,
+                          double) {}
--- a/paddle/fluid/operators/metrics/precision_recall_op.cc
+++ b/paddle/fluid/operators/metrics/precision_recall_op.cc
@@ -242,7 +242,9 @@ REGISTER_OPERATOR(
    ops::PrecisionRecallOpMaker,
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(
-    precision_recall,
-    ops::PrecisionRecallKernel<paddle::platform::CPUPlace, float>,
-    ops::PrecisionRecallKernel<paddle::platform::CPUPlace, double>);
+PD_REGISTER_STRUCT_KERNEL(precision_recall,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::PrecisionRecallKernel,
+                          float,
+                          double) {}
--- a/paddle/fluid/operators/metrics/precision_recall_op.h
+++ b/paddle/fluid/operators/metrics/precision_recall_op.h
@@ -26,7 +26,7 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;

 enum StateVariable { TP = 0, FP, TN, FN };

-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class PrecisionRecallKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {

--- a/paddle/fluid/operators/nccl/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc
@@ -52,7 +52,7 @@ static ncclRedOp_t str_to_nccl_red_type(std::string reduction) {
  return it->second;
 }

-template <typename T>
+template <typename T, typename DeviceContext>
 class NCCLAllReduceKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -87,7 +87,7 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
  }
 };

-template <typename T>
+template <typename T, typename DeviceContext>
 class NCCLReduceKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -128,7 +128,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
  }
 };

-template <typename T>
+template <typename T, typename DeviceContext>
 class NCCLBcastKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -172,6 +172,9 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
-REGISTER_OP_CUDA_KERNEL(ncclBcast, ops::NCCLBcastKernel<float>);
-REGISTER_OP_CUDA_KERNEL(ncclReduce, ops::NCCLReduceKernel<float>);
+PD_REGISTER_STRUCT_KERNEL(
+    ncclAllReduce, GPU, ALL_LAYOUT, ops::NCCLAllReduceKernel, float) {}
+PD_REGISTER_STRUCT_KERNEL(
+    ncclBcast, GPU, ALL_LAYOUT, ops::NCCLBcastKernel, float) {}
+PD_REGISTER_STRUCT_KERNEL(
+    ncclReduce, GPU, ALL_LAYOUT, ops::NCCLReduceKernel, float) {}
--- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc
@@ -31,9 +31,12 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"

 USE_NO_KERNEL_OP(ncclInit);
-USE_CUDA_ONLY_OP(ncclAllReduce);
-USE_CUDA_ONLY_OP(ncclReduce);
-USE_CUDA_ONLY_OP(ncclBcast);
+USE_OP_ITSELF(ncclAllReduce);
+USE_OP_ITSELF(ncclReduce);
+USE_OP_ITSELF(ncclBcast);
+PD_DECLARE_KERNEL(ncclAllReduce, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(ncclReduce, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(ncclBcast, GPU, ALL_LAYOUT);

 namespace f = paddle::framework;
 namespace p = paddle::platform;

--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -320,9 +320,8 @@ REGISTER_OPERATOR(nce_grad,
                  ops::NCEOpGrad,
                  ops::NCEOpGradVarTypeInference,
                  ops::NCEGradOpNoNeedBufferVarInferer);
-REGISTER_OP_CPU_KERNEL(nce,
-                       ops::NCEKernel<paddle::platform::CPUPlace, float>,
-                       ops::NCEKernel<paddle::platform::CPUPlace, double>);
-REGISTER_OP_CPU_KERNEL(nce_grad,
-                       ops::NCEGradKernel<paddle::platform::CPUPlace, float>,
-                       ops::NCEGradKernel<paddle::platform::CPUPlace, double>);
+
+PD_REGISTER_STRUCT_KERNEL(nce, CPU, ALL_LAYOUT, ops::NCEKernel, float, double) {
+}
+PD_REGISTER_STRUCT_KERNEL(
+    nce_grad, CPU, ALL_LAYOUT, ops::NCEGradKernel, float, double) {}
--- a/paddle/fluid/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
@@ -75,7 +75,7 @@ void PrepareSamples(const framework::ExecutionContext &context,
  }
 }

-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class NCEKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
@@ -245,7 +245,7 @@ class NCEKernel : public framework::OpKernel<T> {
  }
 };

-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class NCEGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {

--- a/paddle/fluid/operators/nop_op.cc
+++ b/paddle/fluid/operators/nop_op.cc
@@ -45,7 +45,7 @@ establish the dependency between input and output tensors.
  }
 };

-template <typename T>
+template <typename T, typename DeviceContext>
 class NopKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {}
@@ -58,8 +58,8 @@ namespace ops = paddle::operators;

 REGISTER_OP_WITHOUT_GRADIENT(nop, ops::NopOp, ops::NopOpMaker);

-REGISTER_OP_CPU_KERNEL(nop, ops::NopKernel<float>);
+PD_REGISTER_STRUCT_KERNEL(nop, CPU, ALL_LAYOUT, ops::NopKernel, float) {}

-REGISTER_OP_CUDA_KERNEL(nop, ops::NopKernel<float>);
-
-REGISTER_OP_NPU_KERNEL(nop, ops::NopKernel<float>);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_STRUCT_KERNEL(nop, GPU, ALL_LAYOUT, ops::NopKernel, float) {}
+#endif
--- a/paddle/fluid/operators/number_count_op.cc
+++ b/paddle/fluid/operators/number_count_op.cc
@@ -58,10 +58,9 @@ class NumberCountOpMaker : public framework::OpProtoAndCheckerMaker {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;

-REGISTER_OP_CPU_KERNEL(number_count,
-                       ops::NumberCountOpCPUKernel<int>,
-                       ops::NumberCountOpCPUKernel<int64_t>);
-
 REGISTER_OP_WITHOUT_GRADIENT(number_count,
                             ops::NumberCountOp,
                             ops::NumberCountOpMaker);
+
+PD_REGISTER_STRUCT_KERNEL(
+    number_count, CPU, ALL_LAYOUT, ops::NumberCountOpCPUKernel, int, int64_t) {}
--- a/paddle/fluid/operators/number_count_op.cu
+++ b/paddle/fluid/operators/number_count_op.cu
@@ -79,7 +79,7 @@ __global__ void NumberCount(const T* numbers,
  }
 }

-template <typename T>
+template <typename T, typename DeviceContext>
 class NumberCountOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
@@ -111,4 +111,5 @@ class NumberCountOpCUDAKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;

-REGISTER_OP_CUDA_KERNEL(number_count, ops::NumberCountOpCUDAKernel<int64_t>);
+PD_REGISTER_STRUCT_KERNEL(
+    number_count, GPU, ALL_LAYOUT, ops::NumberCountOpCUDAKernel, int64_t) {}
--- a/paddle/fluid/operators/number_count_op.h
+++ b/paddle/fluid/operators/number_count_op.h
@@ -24,7 +24,7 @@
 namespace paddle {
 namespace operators {

-template <typename T>
+template <typename T, typename DeviceContext>
 class NumberCountOpCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {

--- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc
+++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc
@@ -133,5 +133,5 @@ namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad,
                             ops::ProximalAdagradOp,
                             ops::ProximalAdagradOpMaker);
-REGISTER_OP_CPU_KERNEL(proximal_adagrad,
-                       ops::ProximalAdagradOpKernel<phi::CPUContext, float>);
+PD_REGISTER_STRUCT_KERNEL(
+    proximal_adagrad, CPU, ALL_LAYOUT, ops::ProximalAdagradOpKernel, float) {}
--- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu
+++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu
@@ -13,5 +13,5 @@ specific language governing permissions and limitations under the License. */
 #include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h"

 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(proximal_adagrad,
-                        ops::ProximalAdagradOpKernel<phi::GPUContext, float>);
+PD_REGISTER_STRUCT_KERNEL(
+    proximal_adagrad, GPU, ALL_LAYOUT, ops::ProximalAdagradOpKernel, float) {}
--- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h
+++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class ProximalAdagradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {

--- a/paddle/fluid/operators/optimizers/proximal_gd_op.cc
+++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cc
@@ -106,5 +106,6 @@ namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(proximal_gd,
                             ops::ProximalGDOp,
                             ops::ProximalGDOpMaker);
-REGISTER_OP_CPU_KERNEL(proximal_gd,
-                       ops::ProximalGDOpKernel<phi::CPUContext, float>);
+
+PD_REGISTER_STRUCT_KERNEL(
+    proximal_gd, CPU, ALL_LAYOUT, ops::ProximalGDOpKernel, float) {}
--- a/paddle/fluid/operators/optimizers/proximal_gd_op.cu
+++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cu
@@ -13,5 +13,5 @@ specific language governing permissions and limitations under the License. */
 #include "paddle/fluid/operators/optimizers/proximal_gd_op.h"

 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(proximal_gd,
-                        ops::ProximalGDOpKernel<phi::GPUContext, float>);
+PD_REGISTER_STRUCT_KERNEL(
+    proximal_gd, GPU, ALL_LAYOUT, ops::ProximalGDOpKernel, float) {}
--- a/paddle/fluid/operators/optimizers/proximal_gd_op.h
+++ b/paddle/fluid/operators/optimizers/proximal_gd_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class ProximalGDOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {

--- a/paddle/fluid/operators/pad2d_op.cc
+++ b/paddle/fluid/operators/pad2d_op.cc
@@ -402,7 +402,7 @@ static inline void GetPaddings(int* paddings,
  }
 }

-template <typename T>
+template <typename T, typename DeviceContext>
 class Pad2dCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
@@ -520,7 +520,7 @@ class Pad2dCPUKernel : public framework::OpKernel<T> {
  }
 };

-template <typename T>
+template <typename T, typename DeviceContext>
 class Pad2dGradCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
@@ -873,11 +873,8 @@ REGISTER_OPERATOR(pad2d,
 REGISTER_OPERATOR(pad2d_grad,
                  ops::Pad2dOpGrad,
                  ops::Pad2dOpGradNoNeedBufferVarsInferer);
-REGISTER_OP_CPU_KERNEL(pad2d,
-                       ops::Pad2dCPUKernel<float>,
-                       ops::Pad2dCPUKernel<double>,
-                       ops::Pad2dCPUKernel<int>,
-                       ops::Pad2dCPUKernel<int64_t>);
-REGISTER_OP_CPU_KERNEL(pad2d_grad,
-                       ops::Pad2dGradCPUKernel<float>,
-                       ops::Pad2dGradCPUKernel<double>);
+
+PD_REGISTER_STRUCT_KERNEL(
+    pad2d, CPU, ALL_LAYOUT, ops::Pad2dCPUKernel, float, double, int, int64_t) {}
+PD_REGISTER_STRUCT_KERNEL(
+    pad2d_grad, CPU, ALL_LAYOUT, ops::Pad2dGradCPUKernel, float, double) {}
--- a/paddle/fluid/operators/pad2d_op.cu
+++ b/paddle/fluid/operators/pad2d_op.cu
@@ -361,7 +361,7 @@ static inline void GetPaddings(int* paddings,
  }
 }

-template <typename T>
+template <typename T, typename DeviceContext>
 class Pad2dCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
@@ -489,7 +489,7 @@ class Pad2dCUDAKernel : public framework::OpKernel<T> {
  }
 };

-template <typename T>
+template <typename T, typename DeviceContext>
 class Pad2dGradCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
@@ -618,13 +618,19 @@ class Pad2dGradCUDAKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;

-REGISTER_OP_CUDA_KERNEL(pad2d,
-                        ops::Pad2dCUDAKernel<plat::float16>,
-                        ops::Pad2dCUDAKernel<float>,
-                        ops::Pad2dCUDAKernel<double>,
-                        ops::Pad2dCUDAKernel<int>,
-                        ops::Pad2dCUDAKernel<int64_t>);
-REGISTER_OP_CUDA_KERNEL(pad2d_grad,
-                        ops::Pad2dGradCUDAKernel<plat::float16>,
-                        ops::Pad2dGradCUDAKernel<float>,
-                        ops::Pad2dGradCUDAKernel<double>);
+PD_REGISTER_STRUCT_KERNEL(pad2d,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::Pad2dCUDAKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
+PD_REGISTER_STRUCT_KERNEL(pad2d_grad,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::Pad2dGradCUDAKernel,
+                          float,
+                          double,
+                          plat::float16) {}
--- a/paddle/fluid/operators/pad_constant_like_op.cc
+++ b/paddle/fluid/operators/pad_constant_like_op.cc
@@ -243,26 +243,38 @@ REGISTER_OPERATOR(pad_constant_like,
                  ops::PadConstantLikeOpGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(pad_constant_like_grad, ops::PadConstantLikeOpGrad);

-REGISTER_OP_CPU_KERNEL(pad_constant_like,
-                       ops::PadConstantLikeKernel<phi::CPUContext, float>,
-                       ops::PadConstantLikeKernel<phi::CPUContext, double>,
-                       ops::PadConstantLikeKernel<phi::CPUContext, int>,
-                       ops::PadConstantLikeKernel<phi::CPUContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(
-    pad_constant_like_grad,
-    ops::PadConstantLikeGradKernel<phi::CPUContext, float>,
-    ops::PadConstantLikeGradKernel<phi::CPUContext, double>,
-    ops::PadConstantLikeGradKernel<phi::CPUContext, int>,
-    ops::PadConstantLikeGradKernel<phi::CPUContext, int64_t>);
-
-REGISTER_OP_CUDA_KERNEL(pad_constant_like,
-                        ops::PadConstantLikeKernel<phi::GPUContext, float>,
-                        ops::PadConstantLikeKernel<phi::GPUContext, double>,
-                        ops::PadConstantLikeKernel<phi::GPUContext, int>,
-                        ops::PadConstantLikeKernel<phi::GPUContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    pad_constant_like_grad,
-    ops::PadConstantLikeGradKernel<phi::GPUContext, int>,
-    ops::PadConstantLikeGradKernel<phi::GPUContext, int64_t>,
-    ops::PadConstantLikeGradKernel<phi::GPUContext, float>,
-    ops::PadConstantLikeGradKernel<phi::GPUContext, double>);
+PD_REGISTER_STRUCT_KERNEL(pad_constant_like,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::PadConstantLikeKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t) {}
+PD_REGISTER_STRUCT_KERNEL(pad_constant_like_grad,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::PadConstantLikeGradKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_STRUCT_KERNEL(pad_constant_like,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::PadConstantLikeKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t) {}
+PD_REGISTER_STRUCT_KERNEL(pad_constant_like_grad,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::PadConstantLikeGradKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t) {}
+#endif
--- a/paddle/fluid/operators/pad_constant_like_op.h
+++ b/paddle/fluid/operators/pad_constant_like_op.h
@@ -26,7 +26,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class PadConstantLikeKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
@@ -61,7 +61,7 @@ class PadConstantLikeKernel : public framework::OpKernel<T> {
  }
 };

-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class PadConstantLikeGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {

--- a/paddle/fluid/operators/partial_concat_op.cc
+++ b/paddle/fluid/operators/partial_concat_op.cc
@@ -202,14 +202,19 @@ REGISTER_OPERATOR(partial_concat,

 REGISTER_OPERATOR(partial_concat_grad, ops::PartialConcatGradOp);

-REGISTER_OP_CPU_KERNEL(partial_concat,
-                       ops::PartialConcatKernel<phi::CPUContext, double>,
-                       ops::PartialConcatKernel<phi::CPUContext, float>,
-                       ops::PartialConcatKernel<phi::CPUContext, int64_t>,
-                       ops::PartialConcatKernel<phi::CPUContext, int>);
-
-REGISTER_OP_CPU_KERNEL(partial_concat_grad,
-                       ops::PartialConcatGradientOpKernel<float>,
-                       ops::PartialConcatGradientOpKernel<int>,
-                       ops::PartialConcatGradientOpKernel<double>,
-                       ops::PartialConcatGradientOpKernel<int64_t>);
+PD_REGISTER_STRUCT_KERNEL(partial_concat,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::PartialConcatKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t) {}
+PD_REGISTER_STRUCT_KERNEL(partial_concat_grad,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::PartialConcatGradientOpKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t) {}
--- a/paddle/fluid/operators/partial_concat_op.cu
+++ b/paddle/fluid/operators/partial_concat_op.cu
@@ -65,7 +65,7 @@ __global__ void ConcatPartialGradCUDAKernel(T **in,
  }
 }

-template <typename T>
+template <typename T, typename DeviceContext>
 class PartialConcatOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
@@ -146,7 +146,7 @@ class PartialConcatOpCUDAKernel : public framework::OpKernel<T> {
  }
 };

-template <typename T>
+template <typename T, typename DeviceContext>
 class PartialConcatGradOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
@@ -231,16 +231,22 @@ class PartialConcatGradOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(partial_concat,
-                        ops::PartialConcatOpCUDAKernel<float>,
-                        ops::PartialConcatOpCUDAKernel<double>,
-                        ops::PartialConcatOpCUDAKernel<int>,
-                        ops::PartialConcatOpCUDAKernel<int64_t>,
-                        ops::PartialConcatOpCUDAKernel<plat::float16>);
-
-REGISTER_OP_CUDA_KERNEL(partial_concat_grad,
-                        ops::PartialConcatGradOpCUDAKernel<float>,
-                        ops::PartialConcatGradOpCUDAKernel<double>,
-                        ops::PartialConcatGradOpCUDAKernel<int>,
-                        ops::PartialConcatGradOpCUDAKernel<int64_t>,
-                        ops::PartialConcatGradOpCUDAKernel<plat::float16>);
+
+PD_REGISTER_STRUCT_KERNEL(partial_concat,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::PartialConcatOpCUDAKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
+PD_REGISTER_STRUCT_KERNEL(partial_concat_grad,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::PartialConcatGradOpCUDAKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
--- a/paddle/fluid/operators/partial_concat_op.h
+++ b/paddle/fluid/operators/partial_concat_op.h
@@ -39,7 +39,7 @@ static inline int64_t ComputeStartIndex(int64_t start_index, int64_t size) {
  return start_index;
 }

-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class PartialConcatKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -84,7 +84,7 @@ class PartialConcatKernel : public framework::OpKernel<T> {
  }
 };

-template <typename T>
+template <typename T, typename DeviceContext>
 class PartialConcatGradientOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {

--- a/paddle/fluid/operators/partial_sum_op.cc
+++ b/paddle/fluid/operators/partial_sum_op.cc
@@ -204,14 +204,19 @@ REGISTER_OPERATOR(partial_sum,

 REGISTER_OPERATOR(partial_sum_grad, ops::PartialSumGradOp);

-REGISTER_OP_CPU_KERNEL(partial_sum,
-                       ops::PartialSumKernel<phi::CPUContext, float>,
-                       ops::PartialSumKernel<phi::CPUContext, int>,
-                       ops::PartialSumKernel<phi::CPUContext, double>,
-                       ops::PartialSumKernel<phi::CPUContext, int64_t>);
-
-REGISTER_OP_CPU_KERNEL(partial_sum_grad,
-                       ops::PartialSumGradientOpKernel<float>,
-                       ops::PartialSumGradientOpKernel<int>,
-                       ops::PartialSumGradientOpKernel<double>,
-                       ops::PartialSumGradientOpKernel<int64_t>);
+PD_REGISTER_STRUCT_KERNEL(partial_sum,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::PartialSumKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t) {}
+PD_REGISTER_STRUCT_KERNEL(partial_sum_grad,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::PartialSumGradientOpKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t) {}
--- a/paddle/fluid/operators/partial_sum_op.cu
+++ b/paddle/fluid/operators/partial_sum_op.cu
@@ -70,7 +70,7 @@ __global__ void PartialSumGradCUDAKernel(T **res_grad,
  }
 }

-template <typename T>
+template <typename T, typename DeviceContext>
 class PartialSumOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
@@ -144,7 +144,7 @@ class PartialSumOpCUDAKernel : public framework::OpKernel<T> {
  }
 };

-template <typename T>
+template <typename T, typename DeviceContext>
 class PartialSumGradOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
@@ -233,18 +233,3 @@ class PartialSumGradOpCUDAKernel : public framework::OpKernel<T> {

 }  // namespace operators
 }  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(partial_sum,
-                        ops::PartialSumOpCUDAKernel<float>,
-                        ops::PartialSumOpCUDAKernel<double>,
-                        ops::PartialSumOpCUDAKernel<int>,
-                        ops::PartialSumOpCUDAKernel<int64_t>,
-                        ops::PartialSumOpCUDAKernel<plat::float16>);
-
-REGISTER_OP_CUDA_KERNEL(partial_sum_grad,
-                        ops::PartialSumGradOpCUDAKernel<float>,
-                        ops::PartialSumGradOpCUDAKernel<double>,
-                        ops::PartialSumGradOpCUDAKernel<int>,
-                        ops::PartialSumGradOpCUDAKernel<int64_t>,
-                        ops::PartialSumGradOpCUDAKernel<plat::float16>);
--- a/paddle/fluid/operators/partial_sum_op.h
+++ b/paddle/fluid/operators/partial_sum_op.h
@@ -21,7 +21,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class PartialSumKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -57,7 +57,7 @@ class PartialSumKernel : public framework::OpKernel<T> {
  }
 };

-template <typename T>
+template <typename T, typename DeviceContext>
 class PartialSumGradientOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {

--- a/paddle/fluid/operators/positive_negative_pair_op.cc
+++ b/paddle/fluid/operators/positive_negative_pair_op.cc
@@ -253,7 +253,10 @@ namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(positive_negative_pair,
                             ops::PositiveNegativePairOp,
                             ops::PositiveNegativePairOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    positive_negative_pair,
-    ops::PositiveNegativePairKernel<paddle::platform::CPUPlace, float>,
-    ops::PositiveNegativePairKernel<paddle::platform::CPUPlace, double>);
+
+PD_REGISTER_STRUCT_KERNEL(positive_negative_pair,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::PositiveNegativePairKernel,
+                          float,
+                          double) {}
--- a/paddle/fluid/operators/positive_negative_pair_op.h
+++ b/paddle/fluid/operators/positive_negative_pair_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class PositiveNegativePairKernel : public framework::OpKernel<T> {
 public:
  struct PredictionResult {

--- a/paddle/fluid/operators/prroi_pool_op.cc
+++ b/paddle/fluid/operators/prroi_pool_op.cc
@@ -195,13 +195,20 @@ REGISTER_OPERATOR(prroi_pool,
                  ops::PRROIPoolGradMaker<paddle::framework::OpDesc>,
                  ops::PRROIPoolGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(prroi_pool_grad, ops::PRROIPoolGradOp);
-REGISTER_OP_CPU_KERNEL(prroi_pool,
-                       ops::CPUPRROIPoolOpKernel<phi::CPUContext, float>,
-                       ops::CPUPRROIPoolOpKernel<phi::CPUContext, double>,
-                       ops::CPUPRROIPoolOpKernel<phi::CPUContext, int>,
-                       ops::CPUPRROIPoolOpKernel<phi::CPUContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(prroi_pool_grad,
-                       ops::CPUPRROIPoolGradOpKernel<phi::CPUContext, float>,
-                       ops::CPUPRROIPoolGradOpKernel<phi::CPUContext, double>,
-                       ops::CPUPRROIPoolGradOpKernel<phi::CPUContext, int>,
-                       ops::CPUPRROIPoolGradOpKernel<phi::CPUContext, int64_t>);
+
+PD_REGISTER_STRUCT_KERNEL(prroi_pool,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::CPUPRROIPoolOpKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t) {}
+PD_REGISTER_STRUCT_KERNEL(prroi_pool_grad,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::CPUPRROIPoolGradOpKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t) {}
--- a/paddle/fluid/operators/prroi_pool_op.cu
+++ b/paddle/fluid/operators/prroi_pool_op.cu
@@ -211,7 +211,7 @@ __global__ void GPUPRROIPoolBackward(const int nthreads,
  }
 }

-template <typename T>
+template <typename T, typename DeviceContext>
 class GPUPRROIPoolOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -314,7 +314,7 @@ class GPUPRROIPoolOpKernel : public framework::OpKernel<T> {
  }
 };

-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class GPUPRROIPoolGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -428,9 +428,12 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel<T> {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(prroi_pool,
-                        ops::GPUPRROIPoolOpKernel<float>,
-                        ops::GPUPRROIPoolOpKernel<double>);
-REGISTER_OP_CUDA_KERNEL(prroi_pool_grad,
-                        ops::GPUPRROIPoolGradOpKernel<phi::GPUContext, float>,
-                        ops::GPUPRROIPoolGradOpKernel<phi::GPUContext, double>);
+
+PD_REGISTER_STRUCT_KERNEL(
+    prroi_pool, GPU, ALL_LAYOUT, ops::GPUPRROIPoolOpKernel, float, double) {}
+PD_REGISTER_STRUCT_KERNEL(prroi_pool_grad,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::GPUPRROIPoolGradOpKernel,
+                          float,
+                          double) {}
--- a/paddle/fluid/operators/prroi_pool_op.h
+++ b/paddle/fluid/operators/prroi_pool_op.h
@@ -327,7 +327,7 @@ inline HOSTDEVICE void PrRoIPoolingCoorBackward(int s_w,
                        (*this_out_grad));
 }

-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class CPUPRROIPoolOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -481,7 +481,7 @@ class CPUPRROIPoolOpKernel : public framework::OpKernel<T> {
  }
 };

-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class CPUPRROIPoolGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {

--- a/paddle/fluid/operators/prune_gate_by_capacity_op.cc
+++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cc
@@ -126,7 +126,9 @@ REGISTER_OP_WITHOUT_GRADIENT(prune_gate_by_capacity,
                             ops::PruneGateByCapacityOp,
                             ops::PruneGateByCapacityOpMaker);

-REGISTER_OP_CPU_KERNEL(
-    prune_gate_by_capacity,
-    ops::PruneGateByCapacityCPUKernel<phi::CPUContext, int>,
-    ops::PruneGateByCapacityCPUKernel<phi::CPUContext, int64_t>);
+PD_REGISTER_STRUCT_KERNEL(prune_gate_by_capacity,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::PruneGateByCapacityCPUKernel,
+                          int,
+                          int64_t) {}
--- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu
+++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu
@@ -105,7 +105,7 @@ static void VisitDataType(phi::DataType type, Visitor visitor) {
  }
 }

-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class PruneGateByCapacityCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
@@ -127,6 +127,8 @@ class PruneGateByCapacityCUDAKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle

-REGISTER_OP_CUDA_KERNEL(
-    prune_gate_by_capacity,
-    ops::PruneGateByCapacityCUDAKernel<phi::GPUContext, int64_t>);
+PD_REGISTER_STRUCT_KERNEL(prune_gate_by_capacity,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::PruneGateByCapacityCUDAKernel,
+                          int64_t) {}
--- a/paddle/fluid/operators/prune_gate_by_capacity_op.h
+++ b/paddle/fluid/operators/prune_gate_by_capacity_op.h
@@ -20,7 +20,7 @@
 namespace paddle {
 namespace operators {

-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class PruneGateByCapacityCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {

--- a/paddle/fluid/operators/pull_box_extended_sparse_op.cc
+++ b/paddle/fluid/operators/pull_box_extended_sparse_op.cc
@@ -151,10 +151,15 @@ REGISTER_OPERATOR(

 REGISTER_OPERATOR(push_box_extended_sparse, ops::PushBoxExtendedSparseOp);

-REGISTER_OP_CPU_KERNEL(pull_box_extended_sparse,
-                       ops::PullBoxExtendedSparseCPUKernel<float>,
-                       ops::PullBoxExtendedSparseCPUKernel<double>);
-
-REGISTER_OP_CPU_KERNEL(push_box_extended_sparse,
-                       ops::PushBoxExtendedSparseCPUKernel<float>,
-                       ops::PushBoxExtendedSparseCPUKernel<double>);
+PD_REGISTER_STRUCT_KERNEL(pull_box_extended_sparse,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::PullBoxExtendedSparseCPUKernel,
+                          float,
+                          double) {}
+PD_REGISTER_STRUCT_KERNEL(push_box_extended_sparse,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::PushBoxExtendedSparseCPUKernel,
+                          float,
+                          double) {}
--- a/paddle/fluid/operators/pull_box_extended_sparse_op.cu
+++ b/paddle/fluid/operators/pull_box_extended_sparse_op.cu
@@ -19,7 +19,7 @@
 namespace paddle {
 namespace operators {

-template <typename T>
+template <typename T, typename DeviceContext>
 class PullBoxExtendedSparseCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
@@ -27,7 +27,7 @@ class PullBoxExtendedSparseCUDAKernel : public framework::OpKernel<T> {
  }
 };

-template <typename T>
+template <typename T, typename DeviceContext>
 class PushBoxExtendedSparseCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
@@ -38,9 +38,16 @@ class PushBoxExtendedSparseCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(pull_box_extended_sparse,
-                        ops::PullBoxExtendedSparseCUDAKernel<float>,
-                        ops::PullBoxExtendedSparseCUDAKernel<double>);
-REGISTER_OP_CUDA_KERNEL(push_box_extended_sparse,
-                        ops::PushBoxExtendedSparseCUDAKernel<float>,
-                        ops::PushBoxExtendedSparseCUDAKernel<double>);
+
+PD_REGISTER_STRUCT_KERNEL(pull_box_extended_sparse,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::PullBoxExtendedSparseCUDAKernel,
+                          float,
+                          double) {}
+PD_REGISTER_STRUCT_KERNEL(push_box_extended_sparse,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::PushBoxExtendedSparseCUDAKernel,
+                          float,
+                          double) {}
--- a/paddle/fluid/operators/pull_box_extended_sparse_op.h
+++ b/paddle/fluid/operators/pull_box_extended_sparse_op.h
@@ -108,7 +108,7 @@ static void PushBoxExtendedSparseFunctor(
 #endif
 }

-template <typename T>
+template <typename T, typename DeviceContext>
 class PullBoxExtendedSparseCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
@@ -116,7 +116,7 @@ class PullBoxExtendedSparseCPUKernel : public framework::OpKernel<T> {
  }
 };

-template <typename T>
+template <typename T, typename DeviceContext>
 class PushBoxExtendedSparseCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {

--- a/paddle/fluid/operators/pull_box_sparse_op.cc
+++ b/paddle/fluid/operators/pull_box_sparse_op.cc
@@ -135,5 +135,8 @@ REGISTER_OPERATOR(pull_box_sparse,
                  ops::PushBoxSparseOpMaker<paddle::framework::OpDesc>,
                  ops::PushBoxSparseOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(push_box_sparse, ops::PushBoxSparseOp);
-REGISTER_OP_CPU_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel<float>);
-REGISTER_OP_CPU_KERNEL(push_box_sparse, ops::PushBoxSparseKernel<float>);
+
+PD_REGISTER_STRUCT_KERNEL(
+    pull_box_sparse, CPU, ALL_LAYOUT, ops::PullBoxSparseKernel, float) {}
+PD_REGISTER_STRUCT_KERNEL(
+    push_box_sparse, CPU, ALL_LAYOUT, ops::PushBoxSparseKernel, float) {}
--- a/paddle/fluid/operators/pull_box_sparse_op.h
+++ b/paddle/fluid/operators/pull_box_sparse_op.h
@@ -113,7 +113,7 @@ static void PushBoxSparseFunctor(const framework::ExecutionContext &ctx) {
 #endif
 }

-template <typename T>
+template <typename T, typename DeviceContext>
 class PullBoxSparseKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
@@ -121,7 +121,7 @@ class PullBoxSparseKernel : public framework::OpKernel<T> {
  }
 };

-template <typename T>
+template <typename T, typename DeviceContext>
 class PushBoxSparseKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {

--- a/paddle/fluid/operators/pull_box_sparse_op.kps
+++ b/paddle/fluid/operators/pull_box_sparse_op.kps
@@ -45,16 +45,7 @@ limitations under the License. */
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;

-#ifdef PADDLE_WITH_XPU_KP
-REGISTER_OP_KERNEL(pull_box_sparse,
-                   KP,
-                   plat::XPUPlace,
-                   ops::PullBoxSparseKernel<float>);
-REGISTER_OP_KERNEL(push_box_sparse,
-                   KP,
-                   plat::XPUPlace,
-                   ops::PushBoxSparseKernel<float>);
-#else
-REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel<float>);
-REGISTER_OP_CUDA_KERNEL(push_box_sparse, ops::PushBoxSparseKernel<float>);
-#endif
+PD_REGISTER_STRUCT_KERNEL(
+    pull_box_sparse, KPS, ALL_LAYOUT, ops::PullBoxSparseKernel, float) {}
+PD_REGISTER_STRUCT_KERNEL(
+    push_box_sparse, KPS, ALL_LAYOUT, ops::PushBoxSparseKernel, float) {}
--- a/paddle/fluid/operators/pull_gpups_sparse_op.cc
+++ b/paddle/fluid/operators/pull_gpups_sparse_op.cc
@@ -145,9 +145,16 @@ REGISTER_OPERATOR(pull_gpups_sparse,
                  ops::PushGpuPSSparseOpMaker<paddle::framework::OpDesc>,
                  ops::PushGpuPSSparseOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(push_gpups_sparse, ops::PushGpuPSSparseOp);
-REGISTER_OP_CPU_KERNEL(pull_gpups_sparse,
-                       ops::PullGpuPSSparseCPUKernel<float>,
-                       ops::PullGpuPSSparseCPUKernel<double>)
-REGISTER_OP_CPU_KERNEL(push_gpups_sparse,
-                       ops::PushGpuPSSparseCPUKernel<float>,
-                       ops::PushGpuPSSparseCPUKernel<double>)
+
+PD_REGISTER_STRUCT_KERNEL(pull_gpups_sparse,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::PullGpuPSSparseCPUKernel,
+                          float,
+                          double) {}
+PD_REGISTER_STRUCT_KERNEL(push_gpups_sparse,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::PushGpuPSSparseCPUKernel,
+                          float,
+                          double) {}
--- a/paddle/fluid/operators/pull_gpups_sparse_op.cu
+++ b/paddle/fluid/operators/pull_gpups_sparse_op.cu
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 using phi::PADDLE_CUDA_NUM_THREADS;

-template <typename T>
+template <typename T, typename DeviceContext>
 class PullGpuPSSparseCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
@@ -28,7 +28,7 @@ class PullGpuPSSparseCUDAKernel : public framework::OpKernel<T> {
  }
 };

-template <typename T>
+template <typename T, typename DeviceContext>
 class PushGpuPSSparseCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
@@ -39,9 +39,15 @@ class PushGpuPSSparseCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle

 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(pull_gpups_sparse,
-                        ops::PullGpuPSSparseCUDAKernel<float>,
-                        ops::PullGpuPSSparseCUDAKernel<double>)
-REGISTER_OP_CUDA_KERNEL(push_gpups_sparse,
-                        ops::PushGpuPSSparseCUDAKernel<float>,
-                        ops::PushGpuPSSparseCUDAKernel<double>)
+PD_REGISTER_STRUCT_KERNEL(pull_gpups_sparse,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::PullGpuPSSparseCUDAKernel,
+                          float,
+                          double) {}
+PD_REGISTER_STRUCT_KERNEL(push_gpups_sparse,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::PushGpuPSSparseCUDAKernel,
+                          float,
+                          double) {}
--- a/paddle/fluid/operators/pull_gpups_sparse_op.h
+++ b/paddle/fluid/operators/pull_gpups_sparse_op.h
@@ -97,7 +97,7 @@ static void PushGpuPSSparseFunctor(const framework::ExecutionContext &ctx) {
 #endif
 }

-template <typename T>
+template <typename T, typename DeviceContext>
 class PullGpuPSSparseCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
@@ -105,7 +105,7 @@ class PullGpuPSSparseCPUKernel : public framework::OpKernel<T> {
  }
 };

-template <typename T>
+template <typename T, typename DeviceContext>
 class PushGpuPSSparseCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {

--- a/paddle/fluid/operators/pull_sparse_op.cc
+++ b/paddle/fluid/operators/pull_sparse_op.cc
@@ -143,5 +143,7 @@ REGISTER_OPERATOR(pull_sparse,
                  ops::PushSparseOpMaker<paddle::framework::OpDesc>,
                  ops::PushSparseOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(push_sparse, ops::PushSparseOp);
-REGISTER_OP_CPU_KERNEL(pull_sparse, ops::PullSparseCPUKernel<float>)
-REGISTER_OP_CPU_KERNEL(push_sparse, ops::PushSparseCPUKernel<float>)
+PD_REGISTER_STRUCT_KERNEL(
+    pull_sparse, CPU, ALL_LAYOUT, ops::PullSparseCPUKernel, float) {}
+PD_REGISTER_STRUCT_KERNEL(
+    push_sparse, CPU, ALL_LAYOUT, ops::PushSparseCPUKernel, float) {}
--- a/paddle/fluid/operators/pull_sparse_op.h
+++ b/paddle/fluid/operators/pull_sparse_op.h
@@ -66,7 +66,7 @@ void PushSparseFunctor(const framework::ExecutionContext& ctx) {
                                                &grads);
 }

-template <typename T>
+template <typename T, typename DeviceContext>
 class PullSparseCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -74,7 +74,7 @@ class PullSparseCPUKernel : public framework::OpKernel<T> {
  }
 };

-template <typename T>
+template <typename T, typename DeviceContext>
 class PushSparseCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {

--- a/paddle/fluid/operators/pull_sparse_v2_op.cc
+++ b/paddle/fluid/operators/pull_sparse_v2_op.cc
@@ -135,5 +135,7 @@ REGISTER_OPERATOR(pull_sparse_v2,
                  ops::PushSparseV2OpMaker<paddle::framework::OpDesc>,
                  ops::PushSparseV2OpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(push_sparse_v2, ops::PushSparseV2Op);
-REGISTER_OP_CPU_KERNEL(pull_sparse_v2, ops::PullSparseV2CPUKernel<float>)
-REGISTER_OP_CPU_KERNEL(push_sparse_v2, ops::PushSparseV2CPUKernel<float>)
+PD_REGISTER_STRUCT_KERNEL(
+    pull_sparse_v2, CPU, ALL_LAYOUT, ops::PullSparseV2CPUKernel, float) {}
+PD_REGISTER_STRUCT_KERNEL(
+    push_sparse_v2, CPU, ALL_LAYOUT, ops::PushSparseV2CPUKernel, float) {}
--- a/paddle/fluid/operators/pull_sparse_v2_op.h
+++ b/paddle/fluid/operators/pull_sparse_v2_op.h
@@ -25,7 +25,7 @@
 namespace paddle {
 namespace operators {

-template <typename T>
+template <typename T, typename DeviceContext>
 class PullSparseV2CPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -33,7 +33,7 @@ class PullSparseV2CPUKernel : public framework::OpKernel<T> {
  }
 };

-template <typename T>
+template <typename T, typename DeviceContext>
 class PushSparseV2CPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {

--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -202,7 +202,6 @@ register_unity_group(
  pad_op.cc)
 register_unity_group(
  cc
-  modified_huber_loss_op.cc
  partial_sum_op.cc
  pixel_shuffle_op.cc
  pool_op.cc